Skip to main content

structured_zstd/encoding/
match_generator.rs

//! Matching algorithm used to find repeated parts in the original data
//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
//!
//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
7
8use alloc::vec::Vec;
9// SIMD/CRC intrinsics now live in `crate::encoding::fastpath::*` where they
10// sit under per-CPU `#[target_feature]` umbrellas; no architecture-specific
11// intrinsic imports remain in this file.
12use super::CompressionLevel;
13use super::Matcher;
14use super::Sequence;
15use super::blocks::encode_offset_with_history;
16use super::bt::BtMatcher;
17#[cfg(test)]
18use super::cost_model::HC_MAX_LIT;
19use super::cost_model::{
20    HC_BITCOST_MULTIPLIER, HC_FORMAT_MINMATCH, HC_OPT_NODE_LEN, HC_OPT_NUM, HC_OPT_PRICE_ARENA_LEN,
21    HC_OPT_PRICE_STRIDE, HC_PREDEF_THRESHOLD, HcOptState, HcOptimalCostProfile,
22};
23#[cfg(test)]
24use super::cost_model::{HC_BLOCKSIZE_MAX, HC_MAX_LL, HC_MAX_ML, HC_MAX_OFF, HcOptPriceType};
25use super::dfast::DfastMatchGenerator;
26// FAST_HASH_FILL_STEP test-only re-export was tied to the legacy
27// SuffixStore MatchGenerator's interleaved hash-fill stride. The
28// upstream zstd-shape Fast kernel walks ip0 with kSearchStrength step-skip
29// acceleration instead, so the constant has no consumer in the
30// remaining live test set today.
31#[cfg(test)]
32use super::match_table::helpers::INCOMPRESSIBLE_SKIP_STEP;
33use super::match_table::helpers::MIN_MATCH_LEN;
34#[cfg(test)]
35use super::match_table::helpers::common_prefix_len;
36#[cfg(test)]
37use super::opt::ldm::HcRawSeq;
38use super::opt::ldm::{HcOptLdmState, HcRawSeqStore};
39use super::opt::types::{
40    HcCandidateQuery, HcOptimalNode, HcOptimalPlanBuffers, HcOptimalPlanState, HcOptimalSequence,
41    MatchCandidate,
42};
43use super::row::RowMatchGenerator;
44use super::simple::fast_matcher::{FAST_LEVEL_1_HASH_LOG, FAST_LEVEL_1_MLS, FastKernelMatcher};
45#[cfg(all(
46    test,
47    feature = "std",
48    target_arch = "aarch64",
49    target_endian = "little"
50))]
51use std::arch::is_aarch64_feature_detected;
52#[cfg(all(test, feature = "std", target_arch = "x86_64"))]
53use std::arch::is_x86_feature_detected;
54
/// Minimum match length for the Double-Fast matcher. The `DfastConfig` docs
/// in this file describe it as the upstream zstd-fixed `minMatch = 5`, kept
/// as a plain constant so it can be used in const contexts.
pub(crate) const DFAST_MIN_MATCH_LEN: usize = 5;
// Bytes the dfast short hash reads (upstream zstd `mls = 5`). Seeding / lookahead
// guards use it so a position is only short-hashed once its full 5-byte key
// is in range.
pub(crate) const DFAST_SHORT_HASH_LOOKAHEAD: usize = 5;
/// Default acceptance floor for the row matcher: every `RowConfig` row in
/// this file sets `mls` to this value.
pub(crate) const ROW_MIN_MATCH_LEN: usize = 5;
// Upstream zstd `clevels.h:31` at level 3 large-input bucket sets
// `hashLog = 17` (the long-hash table) and `chainLog = 16` (the
// short-hash table — upstream zstd names this `chainTable` even though for
// dfast it's used as a plain single-slot hash). Each table holds one
// `U32` per slot; the upstream zstd overwrites on collision and recovers
// compression quality via the inline `_search_next_long` retry
// (after a short-hash hit, probes `hashLong[hl1]` at `ip + 1` and
// keeps the longer match).
//
// We mirror that storage layout: single `u32` per bucket (no
// `[u32; N]` array), `long_hash` sized `1 << DFAST_HASH_BITS` and
// `short_hash` one bit smaller via `DFAST_SHORT_HASH_BITS_DELTA`.
// Two-table footprint at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`,
// exact upstream parity. The `_search_next_long` retry lives in
// `DfastMatchGenerator::hash_candidate` (called via
// `best_match`). Earlier revisions kept a
// 4-slot bucket per hash position; that paid 4× the upstream zstd memory
// without measurable ratio gain once the retry was in place.
//
// `dfast_hash_bits_for_window` still clamps the runtime long-hash
// value to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the
// upper bound rather than a fixed default.
//
// NOTE(review): as written in this file, `dfast_hash_bits_for_window` only
// applies the `MIN_WINDOW_LOG` floor, and its own comment says the upper
// cap comes from the per-level `DfastConfig` at the call site. Confirm this
// const is still the cap actually in use.
pub(crate) const DFAST_HASH_BITS: usize = 17;
/// Difference between `long_hash_bits` and `short_hash_bits` —
/// upstream zstd `hashLog - chainLog` is 1 at every dfast level (`clevels.h`
/// level 2: 16-15=1; level 3: 17-16=1). The short hash is one bit
/// smaller than the long hash so the per-bucket footprint matches
/// upstream zstd sizing exactly.
pub(crate) const DFAST_SHORT_HASH_BITS_DELTA: usize = 1;
/// Sentinel value for an empty slot in the dfast hash tables. Real
/// positions are stored as `(abs_pos - position_base + 1) as u32`, so
/// `0` is reserved as the "empty" marker and a true relative offset
/// of `0` never appears in the table. Mirrors the LDM table's
/// `LdmEntry.offset == 0` convention (see `encoding/ldm/table.rs`)
/// so both rebasing structures share one sentinel scheme.
pub(crate) const DFAST_EMPTY_SLOT: u32 = 0;
98
/// Guard band reserved above the high-water mark before triggering a
/// rebase on the Dfast hash tables. When the next insert would push a
/// relative offset above `u32::MAX - DFAST_REBASE_GUARD_BAND`, the
/// table calls `reduce(GUARD_BAND)` to shift every slot down and
/// advance `position_base` so future inserts stay inside the `u32`
/// window. Same scheme as `encoding/ldm/table.rs`.
pub(crate) const DFAST_REBASE_GUARD_BAND: u32 = 1u32 << 30;
// Skip-acceleration knobs for the dfast scan.
// NOTE(review): the consumers are outside this chunk; presumably the scan
// step grows once per `DFAST_SKIP_STEP_GROWTH_INTERVAL` bytes without a
// match, capped at `DFAST_MAX_SKIP_STEP` — confirm against the dfast kernel.
/// log2 of `DFAST_SKIP_STEP_GROWTH_INTERVAL`.
pub(crate) const DFAST_SKIP_SEARCH_STRENGTH: usize = 6;
/// `1 << DFAST_SKIP_SEARCH_STRENGTH`, i.e. 64.
pub(crate) const DFAST_SKIP_STEP_GROWTH_INTERVAL: usize = 1 << DFAST_SKIP_SEARCH_STRENGTH;
/// Presumably the upper bound on the accelerated skip step — TODO confirm.
pub(crate) const DFAST_MAX_SKIP_STEP: usize = 8;
/// Presumably the step used once input is judged incompressible — TODO confirm.
pub(crate) const DFAST_INCOMPRESSIBLE_SKIP_STEP: usize = 16;
/// `hash_bits` of the test-only default `ROW_CONFIG`. Production level rows
/// (`ROW_L5` .. `ROW_L12`) carry their own literal widths.
pub(crate) const ROW_HASH_BITS: usize = 20;
/// `row_log` of the test-only default `ROW_CONFIG`.
pub(crate) const ROW_LOG: usize = 5;
/// `search_depth` of the test-only default `ROW_CONFIG`.
pub(crate) const ROW_SEARCH_DEPTH: usize = 16;
/// `target_len` of the test-only default `ROW_CONFIG`.
pub(crate) const ROW_TARGET_LEN: usize = 48;
/// Presumably the width in bits of the per-slot row tag — TODO confirm
/// against the row matcher.
pub(crate) const ROW_TAG_BITS: usize = 8;
/// Presumably the "unused slot" marker for the row tables. Note it is
/// `u32::MAX`, not `0` like `DFAST_EMPTY_SLOT` — confirm against the row
/// matcher.
pub(crate) const ROW_EMPTY_SLOT: u32 = u32::MAX;
/// Row hash key width in bytes. The `RowConfig::mls` docs note the key
/// width stays 4 bytes regardless of the min-match floor.
pub(crate) const ROW_HASH_KEY_LEN: usize = 4;
117// HASH_MIX_PRIME now lives in `crate::encoding::fastpath::scalar`; the four
118// per-CPU `hash_mix_u64` variants share it via that module.
119// HC_PRIME3BYTES / HC_PRIME4BYTES moved to match_table::storage
120// alongside the hash helpers in Phase 1e Stage A. Only the test
121// module references the constants directly (production code goes
122// through `MatchTable::hash_value_with_mls`).
123#[cfg(test)]
124use super::match_table::storage::{HC_PRIME3BYTES, HC_PRIME4BYTES};
125
126// HC_HASH_LOG / HC_CHAIN_LOG / HC3_HASH_LOG / HC_EMPTY live on the
127// shared storage module so MatchTable methods can reference them
128// without pulling in this module. Re-imported here so existing
129// macros / configs / tests keep their unqualified names.
130#[cfg(test)]
131use super::match_table::storage::HC_EMPTY;
132use super::match_table::storage::HC3_HASH_LOG;
133// HC_HASH_LOG / HC_CHAIN_LOG feed the test-only `HC_CONFIG` default.
134#[cfg(test)]
135use super::match_table::storage::{HC_CHAIN_LOG, HC_HASH_LOG};
136// HC3_MAX_OFFSET moved to encoding::bt alongside the hash3 candidate
137// probe macro that consumes it; the macro references it via the
138// fully-qualified `$crate::encoding::bt::HC3_MAX_OFFSET` path so this
139// module no longer needs a local import.
/// Default hash-chain `search_depth`; feeds the test-only `HC_CONFIG` and
/// the `HC_OVERRIDE_DEFAULT` row synthesized for strategy overrides.
const HC_SEARCH_DEPTH: usize = 16;
// HC_MIN_MATCH_LEN moved to encoding::hc; re-imported here so
// existing references compile unchanged.
use super::hc::HC_MIN_MATCH_LEN;
/// Alias of the cost model's `HC_FORMAT_MINMATCH`.
/// NOTE(review): presumably the minimum match length for the optimal
/// parser — the consumers are outside this chunk, confirm there.
const HC_OPT_MIN_MATCH_LEN: usize = HC_FORMAT_MINMATCH;
/// Default hash-chain `target_len`; feeds the test-only `HC_CONFIG` and
/// `HC_OVERRIDE_DEFAULT`.
const HC_TARGET_LEN: usize = 48;
146
147// MAX_HC_SEARCH_DEPTH moved to encoding::hc alongside chain_candidates.
148use super::hc::MAX_HC_SEARCH_DEPTH;
149
150// `Strategy` and `StrategyTag` live in `crate::encoding::strategy`.
151// The driver carries a `StrategyTag` field set at `reset()` and
152// dispatches each block into a monomorphised `compress_block::<S>`
153// per concrete strategy.
154
/// Named tuning bundle for the hash-chain / binary-tree matcher.
///
/// Every knob is a named field rather than a positional `usize` argument,
/// so call sites cannot silently swap two parameters of the same type.
#[derive(Clone, Copy, PartialEq, Eq)]
struct HcConfig {
    /// Hash-table width knob (written by the `hash_log` parameter override).
    hash_log: usize,
    /// Chain-table width knob (written by the `chain_log` parameter override).
    chain_log: usize,
    /// Candidate budget per search; a `search_log` override stores
    /// `1 << search_log` here.
    search_depth: usize,
    /// Target match length (written by the `target_length` override).
    target_len: usize,
    /// Hash width of the binary-tree finder (upstream zstd
    /// `mls = BOUNDED(4, minMatch, 6)`).
    ///
    /// Stored per level instead of being derived from `target_len`, so a
    /// `target_length` override can never flip the finder between 5- and
    /// 4-byte hashing by accident. Only the BT body consults it. HC / lazy
    /// levels leave it at 4 because their `hash_position` always hashes
    /// 4 bytes. The value is 5 for the minMatch=5 BT levels (btlazy2 and
    /// btopt L16) and 4 everywhere else.
    search_mls: usize,
}
171
/// Tuning bundle for the row-hash matcher.
#[derive(Clone, Copy, PartialEq, Eq)]
pub(crate) struct RowConfig {
    /// Row hash-table width in bits (written by the `hash_log` override).
    pub(crate) hash_bits: usize,
    /// log2 of the row size; a `search_log` override clamps it to `4..=6`.
    pub(crate) row_log: usize,
    /// Number of candidates probed per search.
    pub(crate) search_depth: usize,
    /// Target match length (written by the `target_length` override).
    pub(crate) target_len: usize,
    /// Regular-search acceptance floor, i.e. upstream zstd
    /// `cParams.minMatch` for the row matcher: a row candidate has to extend
    /// to at least `mls` bytes to be accepted. The C-like advanced API
    /// exposes it as the row min-match knob. The default is
    /// `ROW_MIN_MATCH_LEN` (5). The row hash key itself stays 4 bytes wide
    /// (an internal detail), so changing `mls` moves only the acceptance
    /// floor and leaves the candidate hash distribution untouched.
    pub(crate) mls: usize,
}
186
// Only used as the default HashChain config when the test-only parse×search
// override pairs a level with a backend its native row doesn't populate.
// Field-for-field the same values as `HC_OVERRIDE_DEFAULT` below.
#[cfg(test)]
const HC_CONFIG: HcConfig = HcConfig {
    hash_log: HC_HASH_LOG,
    chain_log: HC_CHAIN_LOG,
    search_depth: HC_SEARCH_DEPTH,
    target_len: HC_TARGET_LEN,
    search_mls: 4,
};

/// Base HashChain config synthesized when a public-parameter strategy
/// override ([`super::parameters`]) routes a level to the HC / BT
/// backend whose native level row didn't populate `hc` (e.g. forcing
/// `Strategy::Lazy2` onto a level the table resolves to Fast). Mirrors
/// the mid-band lazy defaults; the per-knob overrides then refine it.
///
/// The storage constants are spelled with their full path because the
/// unqualified `HC_HASH_LOG` / `HC_CHAIN_LOG` imports in this file are
/// `#[cfg(test)]`-only.
const HC_OVERRIDE_DEFAULT: HcConfig = HcConfig {
    hash_log: super::match_table::storage::HC_HASH_LOG,
    chain_log: super::match_table::storage::HC_CHAIN_LOG,
    search_depth: HC_SEARCH_DEPTH,
    target_len: HC_TARGET_LEN,
    search_mls: 4,
};

// HashChain/BT geometry for the btultra2 band.
// NOTE(review): the level table that picks these rows is outside this
// chunk — confirm which levels use it.
const BTULTRA2_HC_CONFIG: HcConfig = HcConfig {
    hash_log: 24,
    chain_log: 24,
    search_depth: 512,
    target_len: 256,
    search_mls: 4,
};

// Level-22 geometry for large / unhinted sources.
// NOTE(review): presumably mirrors the upstream `clevels.h` level-22 default
// row (chainLog 27, hashLog 25, searchLog 9 => depth 512, targetLength 999)
// — verify against upstream.
const BTULTRA2_HC_CONFIG_L22: HcConfig = HcConfig {
    hash_log: 25,
    chain_log: 27,
    search_depth: 512,
    target_len: 999,
    search_mls: 4,
};

// Level-22 geometry for smaller hinted sources.
// NOTE(review): presumably the three rows below are the upstream
// `clevels.h` <= 256 KB, <= 128 KB and <= 16 KB source-size buckets, selected
// through the ceil-log size hint — verify against the selection code, which
// is outside this chunk.
const BTULTRA2_HC_CONFIG_L22_256K: HcConfig = HcConfig {
    hash_log: 19,
    chain_log: 19,
    search_depth: 1 << 13,
    target_len: 999,
    search_mls: 4,
};

const BTULTRA2_HC_CONFIG_L22_128K: HcConfig = HcConfig {
    hash_log: 17,
    chain_log: 18,
    search_depth: 1 << 11,
    target_len: 999,
    search_mls: 4,
};

const BTULTRA2_HC_CONFIG_L22_16K: HcConfig = HcConfig {
    hash_log: 15,
    chain_log: 15,
    search_depth: 1 << 10,
    target_len: 999,
    search_mls: 4,
};
250
// Default Row config: only used by tests and the test-only parse×search
// override (production greedy L5 carries its own `ROW_L5`).
#[cfg(test)]
const ROW_CONFIG: RowConfig = RowConfig {
    hash_bits: ROW_HASH_BITS,
    row_log: ROW_LOG,
    search_depth: ROW_SEARCH_DEPTH,
    target_len: ROW_TARGET_LEN,
    mls: ROW_MIN_MATCH_LEN,
};

// Level-5 greedy is the ONLY strategy routed to the Row backend
// (`StrategyTag::backend`: greedy -> Row; lazy / btopt / btultra* ->
// HashChain), so it is the only level whose `row:` field is read. The upstream zstd
// `clevels.h` default row (srcSize > 256 KB) for level 5 is searchLog=3,
// targetLength=2, from which the row matcher derives:
//   rowLog       = clamp(searchLog, 4, 6) = 4
//   search_depth = 1 << min(searchLog, rowLog) = 8   (= nbAttempts)
//   target_len   = targetLength = 2                  (nice-match early-out)
// The shared `ROW_CONFIG` (row_log=5, search_depth=16, target_len=48) ran a
// level-12-grade search here: 16 slots per row, never early-exiting until a
// 48-byte match. That exhaustive walk was the dominant cost in greedy L5's
// encode-speed regression vs FFI. `hash_bits` matches upstream zstd's
// `ZSTD_getCParams(5, .., 0).hashLog` = 19 (verified via
// `cparams_check 5`), so the row table is the same width as upstream's
// (2^19 slots); the previous `ROW_HASH_BITS` (20) doubled both row tables vs
// upstream, the dominant peak-memory excess on the greedy band.
//
// NOTE(review): the "ONLY strategy routed to the Row backend" statement
// looks stale. This file also defines Row rows for the lazy band
// (`ROW_L6` .. `ROW_L12`) and documents the Row attach cutoff as covering
// "greedy/lazy". Confirm against the level table and
// `StrategyTag::backend`, which are outside this chunk.
const ROW_L5: RowConfig = RowConfig {
    hash_bits: 19,
    row_log: 4,
    search_depth: 8,
    target_len: 2,
    mls: ROW_MIN_MATCH_LEN,
};

// Upstream zstd `clevels.h` unbounded defaults for the lazy band, verified via
// `ZSTD_getCParams(level, 0, 0)`:
//   L6  { w21 c18 h19 s3 mml5 t4  lazy  } → rowLog 4, depth 1<<3 = 8
//   L7  { w21 c19 h20 s4 mml5 t8  lazy  } → rowLog 4, depth 16
//   L8  { w21 c19 h20 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
//   L9  { w22 c20 h21 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
//   L10 { w22 c21 h22 s5 mml5 t16 lazy2 } → rowLog 5, depth 32
//   L11 { w22 c21 h22 s6 mml5 t16 lazy2 } → rowLog 6, depth 64
//   L12 { w22 c22 h23 s6 mml5 t32 lazy2 } → rowLog 6, depth 64
// `rowLog = clamp(searchLog, 4, 6)`, `depth = 1 << min(searchLog, rowLog)`
// (same derivation as `ROW_L5` above). `hash_bits` carries the upstream zstd
// `hashLog`; the hinted-source clamp in `configure` caps it by the window
// exactly like the upstream zstd `ZSTD_adjustCParams` path.
// Each row below is the `h` / rowLog / depth / `t` column of the matching
// table line; `mls` is the shared `ROW_MIN_MATCH_LEN` (mml5).
const ROW_L6: RowConfig = RowConfig {
    hash_bits: 19,
    row_log: 4,
    search_depth: 8,
    target_len: 4,
    mls: ROW_MIN_MATCH_LEN,
};
const ROW_L7: RowConfig = RowConfig {
    hash_bits: 20,
    row_log: 4,
    search_depth: 16,
    target_len: 8,
    mls: ROW_MIN_MATCH_LEN,
};
const ROW_L8: RowConfig = RowConfig {
    hash_bits: 20,
    row_log: 4,
    search_depth: 16,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
const ROW_L9: RowConfig = RowConfig {
    hash_bits: 21,
    row_log: 4,
    search_depth: 16,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
const ROW_L10: RowConfig = RowConfig {
    hash_bits: 22,
    row_log: 5,
    search_depth: 32,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
const ROW_L11: RowConfig = RowConfig {
    hash_bits: 22,
    row_log: 6,
    search_depth: 64,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
const ROW_L12: RowConfig = RowConfig {
    hash_bits: 23,
    row_log: 6,
    search_depth: 64,
    target_len: 32,
    mls: ROW_MIN_MATCH_LEN,
};
348
/// Hash sizing for one Double-Fast level, taken from the upstream zstd
/// `clevels.h` columns so the widths are config-driven rather than a
/// hardcoded constant.
///
/// Only the Dfast backend reads it, so every non-dfast level row carries
/// `dfast: None`. `minMatch` is not a field: it stays the upstream
/// zstd-fixed `5` (`DFAST_MIN_MATCH_LEN`, which is usable in const contexts).
#[derive(Clone, Copy, PartialEq, Eq)]
struct DfastConfig {
    /// Upstream `cParams.hashLog`: width of the long (8-byte) hash table.
    long_hash_log: u8,
    /// Upstream `cParams.chainLog`: width of the short hash table, which
    /// dfast repurposes as its secondary index.
    short_hash_log: u8,
}

// Level rows from the upstream zstd clevels.h default table (srcSize > 256 KB):
//   L3 -> hashLog 17, chainLog 16
//   L4 -> hashLog 18, chainLog 18
const DFAST_L3: DfastConfig = DfastConfig {
    long_hash_log: 17,
    short_hash_log: 16,
};
const DFAST_L4: DfastConfig = DfastConfig {
    long_hash_log: 18,
    short_hash_log: 18,
};
372
/// Tuning for one Fast-strategy level. The `FastKernelMatcher` (Simple
/// backend) is its only consumer.
///
/// Stored as `LevelParams.fast`: `Some` on Fast level rows, `None` on every
/// other row.
#[derive(Clone, Copy, PartialEq, Eq)]
struct FastConfig {
    /// Upstream zstd `cParams.hashLog`.
    hash_log: u32,
    /// Upstream zstd `cParams.minMatch` (4..=8).
    mls: u32,
    /// Upstream zstd `stepSize`.
    step_size: usize,
}

// Fast level rows: level 1 and level 2.
const FAST_L1: FastConfig = FastConfig {
    hash_log: 14,
    mls: 7,
    step_size: 2,
};
const FAST_L2: FastConfig = FastConfig {
    hash_log: 16,
    mls: 6,
    step_size: 2,
};
394
/// Resolved tuning parameters for a compression level.
///
/// [`StrategyTag`] identifies the level's strategy. The runtime
/// [`BackendTag`] used by the driver dispatcher is NOT read from the tag
/// directly: `LevelParams::backend` derives it from the decoupled `search`
/// axis (`self.search.backend()`), which defaults to the tag's historical
/// pairing and is re-derived from the tag whenever a strategy override is
/// applied.
#[derive(Copy, Clone, PartialEq, Eq)]
struct LevelParams {
    strategy_tag: super::strategy::StrategyTag,
    /// Decoupled search-method axis. Independent of `strategy_tag`'s
    /// parse half: a level can pair any parse (greedy / lazy depth via
    /// `lazy_depth`) with any search backend here. Defaults to the
    /// historical pairing (`strategy_tag.search()`) but is overridable
    /// per level so the parse×search matrix can be swept and tuned.
    search: super::strategy::SearchMethod,
    /// Window log for the level; a `window_log` parameter override
    /// replaces it verbatim.
    window_log: u8,
    /// Lazy-parse depth; feeds `ParseMode::from_lazy_depth` for every
    /// non-binary-tree search backend.
    lazy_depth: u8,
    /// Per-strategy tuning. On a level-table row exactly one is `Some`,
    /// matching the row's backend, so the table self-documents which knobs a
    /// level actually consumes (the others are `None`, not dead placeholders):
    /// `fast` for the Fast/Simple backend, `dfast` for Double-Fast, `hc` for
    /// the HashChain / binary-tree backend, `row` for the Row backend.
    ///
    /// After `apply_param_overrides` more than one may be `Some`: a strategy
    /// override synthesizes the newly active backend's row via
    /// `get_or_insert` and leaves the native row in place.
    fast: Option<FastConfig>,
    dfast: Option<DfastConfig>,
    hc: Option<HcConfig>,
    row: Option<RowConfig>,
}
422
423impl LevelParams {
424    /// Backend family (storage variant) for the driver dispatcher.
425    /// Derived from the decoupled `search` axis so a level can route to
426    /// a different search backend than its `strategy_tag` historically
427    /// implied.
428    fn backend(&self) -> super::strategy::BackendTag {
429        self.search.backend()
430    }
431
432    /// Parse mode derived from the decoupled `search` axis: the binary-tree
433    /// search path carries `ParseMode::Optimal`; every other search backend
434    /// derives greedy/lazy/lazy2 from `lazy_depth`. Reading `search` (not the
435    /// strategy tag) keeps the parse×search decoupling complete even when a
436    /// level whose tag is `Bt*` is overridden to a non-BT search backend.
437    fn parse(&self) -> super::strategy::ParseMode {
438        match self.search {
439            super::strategy::SearchMethod::BinaryTree => super::strategy::ParseMode::Optimal,
440            _ => super::strategy::ParseMode::from_lazy_depth(self.lazy_depth),
441        }
442    }
443
444    /// Cheap fingerprint pre-splitter level, the C-like `blockSplitterLevel`
445    /// knob. Mirrors the upstream zstd `splitLevels[]` table indexed by strategy in
446    /// `ZSTD_optimalBlockSize` (`{0,0,1,2,2,3,3,4,4,4}` over fast..btultra2):
447    /// fast=0, dfast=1, greedy=2, lazy=2, lazy2=3, btlazy2=3,
448    /// btopt/btultra/btultra2=4. We collapse the upstream zstd `lazy2` and `btlazy2`
449    /// strategies into the hash-chain `Lazy` tag, distinguished here by
450    /// `lazy_depth` (the level table runs both at depth 2), so depth 2 routes
451    /// to split level 3 to match the upstream zstd. `split_level == 0` routes to the
452    /// cheap from-borders heuristic; `1..=4` to byChunks with internal
453    /// sampling level `split_level - 1`. The `savings >= 3` gate in
454    /// `optimal_block_size` keeps incompressible data and the first full block
455    /// whole, so homogeneous frames are not over-split.
456    fn pre_split(&self) -> Option<u8> {
457        match self.strategy_tag {
458            super::strategy::StrategyTag::Fast => Some(0),
459            super::strategy::StrategyTag::Dfast => Some(1),
460            super::strategy::StrategyTag::Greedy => Some(2),
461            // The lazy2 / btlazy2 band (Lazy at lazy_depth >= 2, and Btlazy2)
462            // uses the rate-1 full-scan chunk splitter (4), NOT the rate-5
463            // sampler (3). The rate-5 sampler combined with the larger
464            // hash_log is sensitive enough to register a phantom statistical
465            // transition on perfectly homogeneous but periodic input (e.g. a
466            // repeating log-line stream whose period does not divide the 8 KB
467            // chunk size): the sampled bytes land on a different phase in each
468            // chunk, so two identical-distribution chunks look different and
469            // the block is split at 8 KB, then re-split on every window,
470            // cascading a large stream into hundreds of tiny blocks whose
471            // per-block headers dwarf the payload. The rate-1 scan reads every
472            // byte, so it sees periodic data as uniform and declines to split,
473            // while still finding genuine content boundaries (measured better
474            // ratio on the real decode corpus, and no longer expands a
475            // periodic stream vs a single full block). lazy/greedy keep the
476            // coarse samplers (lower hash_log => not sensitive enough to
477            // alias here).
478            super::strategy::StrategyTag::Lazy => {
479                if self.lazy_depth >= 2 {
480                    Some(4)
481                } else {
482                    Some(2)
483                }
484            }
485            super::strategy::StrategyTag::Btlazy2 => Some(4),
486            super::strategy::StrategyTag::BtOpt
487            | super::strategy::StrategyTag::BtUltra
488            | super::strategy::StrategyTag::BtUltra2 => Some(4),
489        }
490    }
491}
492
493/// Apply the public-parameter per-knob overrides (#27) onto the
494/// level-resolved [`LevelParams`], in place. Runs in [`Matcher::reset`]
495/// after the level params are computed and before backend selection, so
496/// a strategy override re-routes the backend uniformly. An all-`None`
497/// override is a no-op the caller skips via
498/// [`super::parameters::ParamOverrides::is_empty`], keeping the default
499/// level geometry byte-identical.
500fn apply_param_overrides(params: &mut LevelParams, ov: &super::parameters::ParamOverrides) {
501    use super::strategy::SearchMethod;
502
503    // 1. Strategy override re-derives tag / search / lazy depth.
504    if let Some(strategy) = ov.strategy {
505        let tag = strategy.tag();
506        params.strategy_tag = tag;
507        params.search = tag.search();
508        params.lazy_depth = strategy.lazy_depth();
509    }
510
511    // 2. Ensure the active backend's config row exists (synthesize a
512    //    default when a strategy override moved off the native row).
513    match params.search {
514        SearchMethod::Fast => {
515            params.fast.get_or_insert(FAST_L1);
516        }
517        SearchMethod::DoubleFast => {
518            params.dfast.get_or_insert(DFAST_L3);
519        }
520        SearchMethod::RowHash => {
521            params.row.get_or_insert(ROW_L5);
522        }
523        SearchMethod::HashChain | SearchMethod::BinaryTree => {
524            // A `Btlazy2` strategy override moved off a non-HC row needs the
525            // BT 5-byte finder hash (upstream zstd minMatch 5); other synthesized HC
526            // rows keep the 4-byte default. An explicit `min_match` override
527            // below refines this further.
528            params.hc.get_or_insert(HcConfig {
529                search_mls: if matches!(params.strategy_tag, super::strategy::StrategyTag::Btlazy2)
530                {
531                    5
532                } else {
533                    HC_OVERRIDE_DEFAULT.search_mls
534                },
535                ..HC_OVERRIDE_DEFAULT
536            });
537        }
538    }
539
540    // 3. window_log (bounds-checked at <= 30 by the builder).
541    if let Some(window_log) = ov.window_log {
542        params.window_log = window_log;
543    }
544
545    // 4. Per-backend numeric knobs map into the active config, mirroring
546    //    the upstream zstd `cParams` -> matcher translation documented on each
547    //    config struct.
548    match params.search {
549        SearchMethod::Fast => {
550            if let Some(fast) = params.fast.as_mut() {
551                if let Some(hash_log) = ov.hash_log {
552                    fast.hash_log = hash_log;
553                }
554                if let Some(min_match) = ov.min_match {
555                    fast.mls = min_match;
556                }
557            }
558        }
559        SearchMethod::DoubleFast => {
560            if let Some(dfast) = params.dfast.as_mut() {
561                // hashLog -> long table, chainLog -> short table (the
562                // dfast secondary index). Both bounds-checked <= 30, so
563                // the `u8` casts are lossless.
564                if let Some(hash_log) = ov.hash_log {
565                    dfast.long_hash_log = hash_log as u8;
566                }
567                if let Some(chain_log) = ov.chain_log {
568                    dfast.short_hash_log = chain_log as u8;
569                }
570            }
571        }
572        SearchMethod::RowHash => {
573            if let Some(row) = params.row.as_mut() {
574                // Row hash-table width override (mirrors dfast `long_hash_log`
575                // / hc `hash_log`). Row has no separate chain table — the
576                // per-row depth comes from `search_log` below — so only
577                // `hash_log` maps here; `chain_log` has no Row analogue.
578                if let Some(hash_log) = ov.hash_log {
579                    row.hash_bits = hash_log as usize;
580                }
581                if let Some(search_log) = ov.search_log {
582                    // Upstream zstd: rowLog = clamp(searchLog, 4, 6);
583                    //        nbAttempts = 1 << min(searchLog, rowLog).
584                    let row_log = (search_log as usize).clamp(4, 6);
585                    row.row_log = row_log;
586                    row.search_depth = 1usize << (search_log as usize).min(row_log);
587                }
588                if let Some(target_length) = ov.target_length {
589                    row.target_len = target_length as usize;
590                }
591                if let Some(min_match) = ov.min_match {
592                    row.mls = min_match as usize;
593                }
594            }
595        }
596        SearchMethod::HashChain | SearchMethod::BinaryTree => {
597            if let Some(hc) = params.hc.as_mut() {
598                if let Some(hash_log) = ov.hash_log {
599                    hc.hash_log = hash_log as usize;
600                }
601                if let Some(chain_log) = ov.chain_log {
602                    hc.chain_log = chain_log as usize;
603                }
604                if let Some(search_log) = ov.search_log {
605                    hc.search_depth = 1usize << search_log;
606                }
607                if let Some(target_length) = ov.target_length {
608                    hc.target_len = target_length as usize;
609                }
610                if let Some(min_match) = ov.min_match {
611                    // Upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`: a BT
612                    // min_match override maps into the finder hash width. Only
613                    // the BT body reads `search_mls`; HC/lazy keep 4-byte
614                    // hashing regardless, so this is a no-op for them.
615                    hc.search_mls = (min_match as usize).clamp(4, 6);
616                }
617            }
618        }
619    }
620}
621
/// Translate the resolved runtime strategy into the upstream zstd LDM
/// strategy ordinal (1..=9) consumed by
/// [`super::ldm::params::LdmParams::adjust_for`].
///
/// Upstream `lazy` and `lazy2` share the collapsed `Lazy` tag, so that tag
/// is split on `lazy_depth`: depth >= 2 is lazy2 (5), anything lower is
/// lazy (4). Every other tag maps to one fixed ordinal.
#[cfg(feature = "hash")]
fn ldm_strategy_ordinal(tag: super::strategy::StrategyTag, lazy_depth: u8) -> u32 {
    use super::strategy::StrategyTag as Tag;

    let is_lazy2 = lazy_depth >= 2;
    match (tag, is_lazy2) {
        (Tag::Fast, _) => 1,
        (Tag::Dfast, _) => 2,
        (Tag::Greedy, _) => 3,
        (Tag::Lazy, false) => 4,
        (Tag::Lazy, true) => 5,
        // Upstream zstd `ZSTD_btlazy2` ordinal.
        (Tag::Btlazy2, _) => 6,
        (Tag::BtOpt, _) => 7,
        (Tag::BtUltra, _) => 8,
        (Tag::BtUltra2, _) => 9,
    }
}
646
647/// `ceil(log2(size))` of a source-size hint, with a zero hint floored to
648/// [`MIN_WINDOW_LOG`]. This is the single quantization every hint-dependent
649/// matcher parameter is derived from: the window-log cap, the HC / Fast hash
650/// and chain widths, the Dfast / Row table widths, the L22 config buckets, and
651/// the Fast attach-vs-copy cutoff. Two hints sharing this value resolve to the
652/// identical matcher shape, which is why it (not the raw byte count) keys the
653/// primed-dictionary snapshot — see [`PrimedKey`]. Operates on the full `u64`
654/// so callers comparing a hint against a cutoff get the same bucketed decision
655/// here and at the driver, with no `as usize` truncation on 32-bit targets.
656pub(crate) fn source_size_ceil_log(size: u64) -> u8 {
657    if size == 0 {
658        MIN_WINDOW_LOG
659    } else {
660        (64 - (size - 1).leading_zeros()) as u8
661    }
662}
663
/// Upstream zstd `ZSTD_shouldAttachDict` cutoff for the Fast strategy, as a ceil-log
/// bucket: 8 KiB = `2^13`, and `bucket <= 13` is exactly `hint <= 8192` because
/// the bucket is monotone in the hint. A hint at or below this (or unknown,
/// `None`) ATTACHES the dictionary (a separate immutable table); a larger hint
/// COPIES it into the live table. Shared by `reset` (which records the mode in
/// the primed-snapshot key) and `prime_with_dictionary` (which acts on it).
const FAST_ATTACH_DICT_CUTOFF_LOG: u8 = 13;

/// Dfast counterpart of [`FAST_ATTACH_DICT_CUTOFF_LOG`]: upstream zstd
/// `ZSTD_dictMatchState` attach cutoff for the double-fast strategy is 16 KiB
/// (`2^14`), so small / unknown-size inputs ATTACH (separate immutable dict
/// long+short tables + dual-probe in `start_matching_fast_loop`) and larger
/// known-size inputs COPY (re-prime the dict into the live tables, where the
/// dense scan matches it as window history). The attach build also self-gates
/// on `use_fast_loop` inside `skip_matching_for_dict_attach` — only the
/// fast-loop levels (L3 / Default / L0) carry the dual-probe.
const DFAST_ATTACH_DICT_CUTOFF_LOG: u8 = 14;

/// `ZSTD_dictMatchState` attach cutoff for the Row (greedy/lazy) strategy is
/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs`): small / unknown-size inputs
/// ATTACH the dict into the separate immutable row index (bounded dual-probe in
/// `row_candidate_rl`), larger known-size inputs dense-COPY into the live rows.
const ROW_ATTACH_DICT_CUTOFF_LOG: u8 = 15;

/// Attach cutoff for the hash-chain backend, as a ceil-log bucket:
/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs[ZSTD_lazy2]`): small /
/// unknown-size inputs ATTACH the dict as a separate hash-chain dms (the dual
/// search in `find_best_match` walks the live input chain + the dms), larger
/// known-size inputs dense-COPY (merge the dict into the live chain and search
/// the one combined chain).
const HC_ATTACH_DICT_CUTOFF_LOG: u8 = 15;

/// BT/optimal attach cutoff for `btlazy2` + `btopt`: 32 KiB (`2^15`, upstream
/// zstd `attachDictSizeCutoffs[ZSTD_btlazy2]` == `[ZSTD_btopt]`). Small /
/// unknown-size inputs ATTACH the dict as a separate DUBT dms; larger known-size
/// inputs COPY the dict into the LIVE binary tree (upstream zstd
/// `ZSTD_resetCCtx_byCopyingCDict`).
const BT_OPT_ATTACH_DICT_CUTOFF_LOG: u8 = 15;

/// BT/optimal attach cutoff for `btultra` + `btultra2`: 8 KiB (`2^13`, upstream
/// zstd `attachDictSizeCutoffs[ZSTD_btultra]` == `[ZSTD_btultra2]`). The deepest
/// parses copy the dict into the live tree past a much smaller source than the
/// `btopt` tier, matching upstream's per-strategy cutoff table.
const BT_ULTRA_ATTACH_DICT_CUTOFF_LOG: u8 = 13;
707
708// Source-size cap for the dfast hash bits when a size hint is present: a tiny
709// input needs no larger hash than its window. The upstream zstd `cParams.hashLog` /
710// `chainLog` (from `DfastConfig`) caps it from above at the call site.
711fn dfast_hash_bits_for_window(max_window_size: usize) -> usize {
712    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
713    window_log.max(MIN_WINDOW_LOG as usize)
714}
715
716fn row_hash_bits_for_window(max_window_size: usize) -> usize {
717    // Upstream zstd `ZSTD_adjustCParams_internal` cap: `hashLog <= windowLog + 1`.
718    // The `+ 1` is load-bearing for L12, whose upstream zstd hashLog (23) exceeds
719    // its windowLog (22) — a plain `windowLog` cap would shrink the L12
720    // table on EVERY hinted reset and split primed snapshots between
721    // hinted and unhinted frames that resolve to the identical geometry.
722    // No constant upper clamp: the old `ROW_HASH_BITS` (20) ceiling
723    // predates the lazy band moving onto Row (L9-12 carry upstream zstd hashLog
724    // 21-23).
725    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
726    (window_log + 1).max(MIN_WINDOW_LOG as usize)
727}
728
729/// `floor(log2(window))` for the HashChain table-log cap (upstream zstd
730/// `ZSTD_adjustCParams_internal`). The caller clamps the level's `hash_log` /
731/// `chain_log` from above with this so a small hinted input doesn't allocate the
732/// full level's tables.
733fn hc_hash_bits_for_window(max_window_size: usize) -> usize {
734    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
735    window_log.max(MIN_WINDOW_LOG as usize)
736}
737
738/// Parameter table for numeric compression levels 1–22.
739///
740/// Each entry maps a zstd compression level to the best-available matcher
741/// backend and tuning knobs. High levels map to dedicated parse modes:
742/// btopt (16-17), btultra (18), btultra2 (19-22) — matching upstream zstd
743/// `clevels.h` (level 19 is `ZSTD_btultra2`, not plain btultra).
744///
745/// Index 0 = level 1, index 21 = level 22.
746#[rustfmt::skip]
747const LEVEL_TABLE: [LevelParams; 22] = [
748    // Exactly one of fast/dfast/hc/row is Some per row, matching the strategy
749    // backend; the rest are None (not dead placeholders).
750    // Lvl  Strategy       wlog  lazy  per-strategy config
751    // ---  -------------- ----  ----  -------------------
752    /* 1 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 19, lazy_depth: 0, fast: Some(FAST_L1), dfast: None, hc: None, row: None },
753    /* 2 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 20, lazy_depth: 0, fast: Some(FAST_L2), dfast: None, hc: None, row: None },
754    /* 3 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L3), hc: None, row: None },
755    /* 4 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L4), hc: None, row: None },
756    // target_len column for L5..=L15 matches upstream zstd cParams.targetLength
757    // from clevels.h table[0] (default — srcSize > 256 KB). Upstream zstd uses
758    // it as the lazy outer loop's `sufficient_len` (nice-match) threshold.
759    // Inflating it above upstream zstd forces the chain walk to complete
760    // search_depth iterations instead of breaking on the first
761    // long-enough match — the dominant cost in the L5..=L15 speed
762    // regression vs FFI (see lazy_band_target_len_matches_default_table).
763    /* 5 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Greedy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 0, fast: None, dfast: None, hc: None, row: Some(ROW_L5) },
764    // L6-12: upstream zstd runs the lazy/lazy2 strategies on the ROW-based
765    // match finder by default (`ZSTD_resolveRowMatchFinderMode`: row mode
766    // is on for greedy..lazy2 whenever SIMD is available) — a bounded
767    // SIMD tag scan per row instead of a pointer-chasing hash-chain walk.
768    // Our HashChain walk on these levels was ~75% of L10 wall time on the
769    // 1 MiB corpus (dependent chain-table loads). Same `RowConfig`
770    // derivation as `ROW_L5` above, upstream zstd values per level in the
771    // `ROW_L6..ROW_L12` comment block.
772    /* 6 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L6) },
773    /* 7 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L7) },
774    /* 8 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L8) },
775    /* 9 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L9) },
776    /*10 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L10) },
777    /*11 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L11) },
778    /*12 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L12) },
779    // L13-15: reference uses btlazy2 (binary-tree finder) with searchLog 4/5/6
780    // (search_depth 16/32/64) and targetLength 32. We run the hash-chain Lazy
781    // parser here, so we mirror the reference search budget rather than inflate
782    // it: matching the table keeps speed near the reference and makes per-level
783    // perf divergences comparable. The binary-tree finder that would let a
784    // smaller searchLog find longer matches (and re-establish a strict ratio
785    // ladder above L12) is tracked separately; until it lands these levels sit
786    // close to L12 on hash-chain inputs by design.
787    /*13 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 16, target_len: 32, search_mls: 5 }), row: None },
788    /*14 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 22, search_depth: 32, target_len: 32, search_mls: 5 }), row: None },
789    /*15 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 23, search_depth: 64, target_len: 32, search_mls: 5 }), row: None },
790    /*16 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 32, target_len: 48, search_mls: 5 }), row: None },
791    /*17 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 32, target_len: 64, search_mls: 4 }), row: None },
792    /*18 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 64, target_len: 64, search_mls: 4 }), row: None },
793    /*19 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 24, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
794    /*20 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 25, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 25, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
795    /*21 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 26, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG), row: None },
796    /*22 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 27, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG_L22), row: None },
797];
798
799/// Assumed source size, in bytes, when building a dictionary's prepared cParams
800/// with an unknown source: upstream zstd `minSrcSize` (`zstd_compress.c`
801/// `ZSTD_adjustCParams_internal`, `ZSTD_cpm_createCDict`: `if (dictSize && srcSize ==
802/// UNKNOWN) srcSize = minSrcSize`, `minSrcSize = (1<<9) + 1`). Used by [`cdict_table_logs`].
803const DICT_MIN_SRC_SIZE: u64 = 513;
804
805/// Upstream zstd `ZSTD_dictAndWindowLog` (`zstd_compress.c`): the window log large
806/// enough to address both the source and the dictionary, used when downsizing
807/// the hash / chain logs for a dictionary-bearing compress. `window_log` is the
808/// (already source-clamped) compress window; `src_size` / `dict_size` are the
809/// assumed source and the dictionary length.
810fn dict_and_window_log(window_log: u8, src_size: u64, dict_size: u64) -> u32 {
811    if dict_size == 0 {
812        return window_log as u32;
813    }
814    let window_size: u64 = 1u64 << window_log;
815    // Plain `+` (matches upstream zstd `ZSTD_dictAndWindowLog`): `window_size` is
816    // `1 << window_log` (window_log <= 31) and dict/src are real data sizes
817    // (<= isize::MAX), so these u64 sums cannot overflow in practice.
818    let dict_and_window = dict_size + window_size;
819    if window_size >= dict_size + src_size {
820        // Window already covers source + dictionary.
821        window_log as u32
822    } else {
823        // ceil(log2(dictAndWindowSize)) = highbit32(x - 1) + 1.
824        source_size_ceil_log(dict_and_window) as u32
825    }
826}
827
828/// Upstream zstd `ZSTD_createCDict` table geometry: the `(hash_log, chain_log)` a
829/// dictionary's prepared match-finder tables get, mirroring
830/// `ZSTD_adjustCParams_internal` under `ZSTD_cpm_createCDict`. A dictionary
831/// supplies the long matches, so upstream zstd downsizes the table widths toward the
832/// dict-and-window log (assuming a `minSrcSize` source) while the live window
833/// stays source-sized. `window_log` is the resolved compress window; `hash_log`
834/// / `chain_log` are the level's own widths; `uses_bt` selects the binary-tree
835/// `cycleLog` (`chainLog - 1`) vs the hash-chain one (`chainLog`).
836fn cdict_table_logs(
837    window_log: u8,
838    hash_log: usize,
839    chain_log: usize,
840    uses_bt: bool,
841    dict_size: usize,
842) -> (usize, usize) {
843    let dict_size = dict_size as u64;
844    // createCDict assumes a minSrcSize source when the real size is unknown.
845    let src_size = DICT_MIN_SRC_SIZE;
846    // Source-size window resize (upstream zstd caps windowLog by ceil_log2(src+dict)).
847    // Plain `+`: src_size is the tiny DICT_MIN_SRC_SIZE constant and dict_size
848    // is a real dictionary length, so the u64 sum cannot overflow.
849    let tsize = src_size + dict_size;
850    let resized_window_log = (window_log as u32)
851        .min(source_size_ceil_log(tsize) as u32)
852        .max(1);
853    let daw = dict_and_window_log(resized_window_log as u8, src_size, dict_size);
854    // `ZSTD_cycleLog(chainLog, strategy)`: chainLog - 1 for binary-tree finders.
855    let cycle_log = (chain_log as u32).saturating_sub(uses_bt as u32);
856    let new_hash_log = if hash_log as u32 > daw + 1 {
857        (daw + 1) as usize
858    } else {
859        hash_log
860    };
861    let new_chain_log = if cycle_log > daw {
862        chain_log.saturating_sub((cycle_log - daw) as usize)
863    } else {
864        chain_log
865    };
866    (new_hash_log, new_chain_log)
867}
868
869/// Smallest window_log (`2^10` = 1 KiB) the encoder will use regardless of source size.
870pub(crate) const MIN_WINDOW_LOG: u8 = 10;
871/// Conservative floor (`2^14` = 16 KiB) for source-size-hinted window tuning.
872///
873/// Hinted windows below 16 KiB (`window_log < 14`) currently regress C-FFI
874/// interoperability on certain compressed-block patterns. Keep hinted
875/// windows at 16 KiB or larger until that compatibility gap is closed.
876const MIN_HINTED_WINDOW_LOG: u8 = 14;
877
878/// Adjust level parameters for a known source size.
879///
880/// This derives a cap from `ceil(log2(src_size))`, then clamps it to
881/// [`MIN_HINTED_WINDOW_LOG`] (16 KiB). A zero-byte size hint is treated as
882/// [`MIN_WINDOW_LOG`] for the raw ceil-log step and then promoted to the hinted
883/// floor. This keeps tables bounded for small inputs while preserving the
884/// encoder's baseline minimum supported window.
885/// For the HC backend, `hash_log` and `chain_log` are reduced
886/// proportionally.
887fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams {
888    // Derive a source-size-based cap from ceil(log2(src_size)), then
889    // clamp first to MIN_WINDOW_LOG (baseline encoder minimum) and then to
890    // MIN_HINTED_WINDOW_LOG (16 KiB hinted floor). For tiny or zero hints we
891    // therefore keep a 16 KiB effective minimum window in hinted mode.
892    // Raw ceil(log2(src_size)) drives the internal table sizes. The
893    // advertised `window_log` is separately floored at MIN_HINTED_WINDOW_LOG
894    // (a decoder-interop requirement on the wire format), but the hash /
895    // chain table widths are internal and never appear in the frame, so they
896    // can track the actual source size below that floor.
897    let raw_src_log = source_size_ceil_log(src_size);
898    let src_log = raw_src_log.max(MIN_WINDOW_LOG).max(MIN_HINTED_WINDOW_LOG);
899    if src_log < params.window_log {
900        params.window_log = src_log;
901    }
902    // Internal match-finder tables are sized from `table_log` — the RAW
903    // source log (floored only at the baseline `MIN_WINDOW_LOG`), NOT the
904    // wire `window_log` floor. The table widths never appear in the frame, so
905    // for small inputs they can track the actual source size and avoid
906    // zeroing a window-sized table per frame; large inputs keep the level's
907    // widths. The cap is applied with the same per-backend headroom the
908    // level table uses, so the load factor (and match quality) is unchanged.
909    // The Dfast backend derives its table widths from the source in `reset`
910    // (`set_hash_bits` recomputes there), so it is not adjusted here. The Row
911    // backend's width IS capped here, mirroring the upstream zstd (see the Row branch).
912    let table_log = raw_src_log.max(MIN_WINDOW_LOG);
913    let backend = params.backend();
914    if backend == super::strategy::BackendTag::HashChain {
915        let hc = params
916            .hc
917            .as_mut()
918            .expect("HashChain level row carries an HcConfig");
919        if (table_log + 2) < hc.hash_log as u8 {
920            hc.hash_log = (table_log + 2) as usize;
921        }
922        if (table_log + 1) < hc.chain_log as u8 {
923            hc.chain_log = (table_log + 1) as usize;
924        }
925    } else if backend == super::strategy::BackendTag::Row {
926        let row = params
927            .row
928            .as_mut()
929            .expect("Row level row carries a RowConfig");
930        // Upstream zstd `ZSTD_adjustCParams_internal` (zstd_compress.c): once
931        // the window is source-capped, `hashLog <= windowLog + 1`. The row
932        // table is `2^hash_bits` slots, exactly upstream's row hashTable
933        // `2^hashLog` slots, so the same cap applies. Without it the row table
934        // stays at the level's unbounded width (e.g. L12 hash_bits 23 = 4x
935        // upstream's source-capped 21), the dominant peak-memory excess on the
936        // row band.
937        let row_cap = (table_log + 1) as usize;
938        if row_cap < row.hash_bits {
939            row.hash_bits = row_cap;
940        }
941    } else if backend == super::strategy::BackendTag::Simple {
942        let fast = params
943            .fast
944            .as_mut()
945            .expect("Fast level row carries a FastConfig");
946        let fast_cap = (table_log + 1) as u32;
947        if fast_cap < fast.hash_log {
948            fast.hash_log = fast_cap;
949        }
950    }
951    params
952}
953
954fn level22_btultra2_params_for_source_size(source_size: Option<u64>) -> LevelParams {
955    let mut hc = match source_size {
956        Some(size) if size <= 16 * 1024 => BTULTRA2_HC_CONFIG_L22_16K,
957        Some(size) if size <= 128 * 1024 => BTULTRA2_HC_CONFIG_L22_128K,
958        Some(size) if size <= 256 * 1024 => BTULTRA2_HC_CONFIG_L22_256K,
959        _ => BTULTRA2_HC_CONFIG_L22,
960    };
961    let mut window_log = match source_size {
962        Some(size) if size <= 16 * 1024 => 14,
963        Some(size) if size <= 128 * 1024 => 17,
964        Some(size) if size <= 256 * 1024 => 18,
965        _ => 27,
966    };
967    if let Some(size) = source_size
968        && size > 256 * 1024
969    {
970        let src_log = source_size_ceil_log(size);
971        window_log = window_log.min(src_log.max(MIN_WINDOW_LOG));
972        let adjusted_table_log = window_log as usize + 1;
973        hc.hash_log = hc.hash_log.min(adjusted_table_log);
974        hc.chain_log = hc.chain_log.min(adjusted_table_log);
975    }
976    LevelParams {
977        strategy_tag: super::strategy::StrategyTag::BtUltra2,
978        search: super::strategy::SearchMethod::BinaryTree,
979        window_log,
980        lazy_depth: 2,
981        fast: None,
982        dfast: None,
983        hc: Some(hc),
984        row: None,
985    }
986}
987
988/// Estimated steady-state heap footprint of a one-shot compression context
989/// at `level` (window history + match-finder tables + block staging), in
990/// bytes. Computed from the same per-level tuning table the encoder
991/// resolves at frame start, so the estimate tracks the real allocations;
992/// it is an upper-bound style budget figure, not an exact accounting.
993pub fn estimated_compression_workspace_bytes(level: CompressionLevel) -> usize {
994    use super::strategy::StrategyTag;
995    let params = resolve_level_params(level, None);
996    let window = 1usize << params.window_log;
997    // Mirror `configure()`: the HC3 short-match side table exists only on
998    // the btultra/btultra2 tags (minMatch 3), capped by the window log; the
999    // BT pointer-pair layout fits inside the `4 << chain_log` chain term
1000    // (pairs over `chain_log - 1` nodes).
1001    let wants_hash3 = matches!(
1002        params.strategy_tag,
1003        StrategyTag::BtUltra | StrategyTag::BtUltra2
1004    );
1005    let uses_bt = matches!(
1006        params.strategy_tag,
1007        StrategyTag::Btlazy2 | StrategyTag::BtOpt | StrategyTag::BtUltra | StrategyTag::BtUltra2
1008    );
1009    let tables = params.fast.map(|f| 4usize << f.hash_log).unwrap_or(0)
1010        + params
1011            .dfast
1012            .map(|d| (4usize << d.long_hash_log) + (4usize << d.short_hash_log))
1013            .unwrap_or(0)
1014        + params
1015            .hc
1016            .map(|h| {
1017                let hash3 = if wants_hash3 {
1018                    4usize
1019                        << super::match_table::storage::HC3_HASH_LOG.min(params.window_log as usize)
1020                } else {
1021                    0
1022                };
1023                (4usize << h.hash_log) + (4usize << h.chain_log) + hash3
1024            })
1025            .unwrap_or(0)
1026        + params
1027            .row
1028            .map(|r| (4usize << r.hash_bits) + (2usize << r.hash_bits))
1029            .unwrap_or(0);
1030    // BT modes box a `BtMatcher`; its retained scratch layout is budgeted
1031    // next to the struct so estimator and allocator evolve together.
1032    let bt = if uses_bt {
1033        super::bt::BtMatcher::estimated_workspace_bytes()
1034    } else {
1035        0
1036    };
1037    // Block staging: literal + sequence buffers plus the compressed-block
1038    // scratch, each bounded by the 128 KiB block size.
1039    let staging = 3 * (128 * 1024);
1040    window + tables + bt + staging
1041}
1042
1043/// Extra steady-state workspace the binary-tree strategies (ordinals 6..=9,
1044/// btlazy2..btultra2) retain beyond the hash/chain tables: the boxed matcher
1045/// plus its scratch arenas, and the HC3 short-match side table for
1046/// btultra/btultra2 (capped by the window log). 0 for non-BT ordinals.
1047pub fn estimated_bt_strategy_extra_bytes(strategy_ordinal: u32, window_log: u32) -> usize {
1048    if !(6..=9).contains(&strategy_ordinal) {
1049        return 0;
1050    }
1051    let hash3 = if matches!(strategy_ordinal, 8 | 9) {
1052        4usize << super::match_table::storage::HC3_HASH_LOG.min(window_log as usize)
1053    } else {
1054        0
1055    };
1056    super::bt::BtMatcher::estimated_workspace_bytes() + hash3
1057}
1058
1059/// Resolve a [`CompressionLevel`] to internal tuning parameters,
1060/// optionally adjusted for a known source size.
1061fn resolve_level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
1062    if matches!(level, CompressionLevel::Level(22)) {
1063        return level22_btultra2_params_for_source_size(source_size);
1064    }
1065    let params = match level {
1066        CompressionLevel::Uncompressed => LevelParams {
1067            strategy_tag: super::strategy::StrategyTag::Fast,
1068            search: super::strategy::SearchMethod::Fast,
1069            // Uncompressed frames emit raw blocks and never reference
1070            // history; advertising a larger window only inflates
1071            // decoder-side buffer reservation. Stay at 17 (128 KiB).
1072            window_log: 17,
1073            lazy_depth: 0,
1074            // Beyond-upstream zstd: hash_log=14 (vs upstream zstd's 13) for 2× fewer
1075            // collisions on structured corpora. Upstream zstd's "base for negative"
1076            // row has targetLength=1 → step_size = 1 + 0 + 1 = 2.
1077            fast: Some(FastConfig {
1078                hash_log: 14,
1079                mls: 6,
1080                step_size: 2,
1081            }),
1082            dfast: None,
1083            hc: None,
1084            row: None,
1085        },
1086        CompressionLevel::Fastest => {
1087            // Only the Fast-specific cParams
1088            // (fast_hash_log / fast_mls / fast_step_size) align
1089            // with Uncompressed / negative-base row. window_log
1090            // stays at LEVEL_TABLE[0]'s value (19) — Fastest still
1091            // does real compression on a full window, unlike
1092            // Uncompressed which clamps to 17.
1093            let mut p = LEVEL_TABLE[0];
1094            p.fast = Some(FastConfig {
1095                hash_log: 14,
1096                mls: 6,
1097                step_size: 2,
1098            });
1099            p
1100        }
1101        CompressionLevel::Default => LEVEL_TABLE[2],
1102        CompressionLevel::Better => LEVEL_TABLE[6],
1103        // Level 13: the first dominant point of the deep-lazy band. The
1104        // mls-wide row key lifted the shallow band's ratio enough that
1105        // level 11 no longer strictly beats level 7 on the ladder corpus;
1106        // the `Best` alias belongs on a config that dominates everything
1107        // below it rather than on a hair-thin margin.
1108        CompressionLevel::Best => LEVEL_TABLE[12],
1109        CompressionLevel::Level(n) => {
1110            if n > 0 {
1111                let idx = (n as usize).min(CompressionLevel::MAX_LEVEL as usize) - 1;
1112                LEVEL_TABLE[idx]
1113            } else if n == 0 {
1114                // Level 0 = default, matching C zstd semantics.
1115                LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1]
1116            } else {
1117                // Negative levels — upstream zstd sets
1118                // targetLength = -level (clampedCompressionLevel),
1119                // yielding step_size = (-level) + 1 since
1120                // !(targetLength) = 0 when targetLength > 0.
1121                // So L-1..L-7 get step_size 2..8. Acceleration
1122                // gradient comes from larger step skipping more
1123                // positions per iter (faster, worse ratio).
1124                // Clamp to upstream zstd's MIN_LEVEL before negating so
1125                // i32::MIN can't overflow on `-n`.
1126                let clamped = n.max(CompressionLevel::MIN_LEVEL);
1127                let target_length = (-clamped) as usize;
1128                let step_size = target_length + 1;
1129                // Upstream zstd row-0 ("base for negative", clevels.h srcSize>256KB):
1130                // hashLog=13, minMatch=7. The 32 KiB hash table (2^13 * 4B)
1131                // is L1d-resident on contemporary cores, so every probe is an
1132                // L1 hit; hashLog=14 (64 KiB) overflows a 32 KiB L1d and turns
1133                // each probe into an L2 access. minMatch=7 (vs 6) skips
1134                // short-distance 6-byte matches: fewer sequences, less
1135                // extension/emit work, and parity with the upstream zstd's negative
1136                // ladder on both ratio and throughput.
1137                LevelParams {
1138                    strategy_tag: super::strategy::StrategyTag::Fast,
1139                    search: super::strategy::SearchMethod::Fast,
1140                    window_log: 19,
1141                    lazy_depth: 0,
1142                    fast: Some(FastConfig {
1143                        hash_log: 13,
1144                        mls: 7,
1145                        step_size,
1146                    }),
1147                    dfast: None,
1148                    hc: None,
1149                    row: None,
1150                }
1151            }
1152        }
1153    };
1154    if let Some(size) = source_size {
1155        adjust_params_for_source_size(params, size)
1156    } else {
1157        params
1158    }
1159}
1160
1161/// The cheap fingerprint pre-splitter level for a compression level (the
1162/// C-like `blockSplitterLevel`), resolved through the same per-level
1163/// `LevelParams` table as every other tuning knob. `None` keeps the whole
1164/// 128 KiB block. The frame loop reads this instead of hardcoding the
1165/// level→split mapping at the call site.
1166pub(crate) fn level_pre_split(level: CompressionLevel) -> Option<usize> {
1167    // Resolve through `resolve_level_params` directly — NOT via the legacy
1168    // `numeric_level()` alias — so named presets read the SAME table row as
1169    // every other tuning knob (`Best` maps to its own row there, which is
1170    // not the row its numeric alias points at). `Uncompressed` (raw
1171    // blocks) never splits.
1172    if matches!(level, CompressionLevel::Uncompressed) {
1173        return None;
1174    }
1175    resolve_level_params(level, None)
1176        .pre_split()
1177        .map(usize::from)
1178}
1179
1180/// Backend storage for [`MatchGeneratorDriver`]. Exactly one match-finder
1181/// state lives in the driver at a time — the active variant. Backend
1182/// transitions in [`Matcher::reset`] drain the current variant's allocations
1183/// into the shared `vec_pool` and then replace `storage` with a freshly
1184/// constructed variant for the new backend.
1185///
1186/// Replaces the prior pattern of four parallel fields (`match_generator`,
1187/// `dfast_match_generator: Option<…>`, `row_match_generator: Option<…>`,
1188/// `hc_match_generator: Option<…>`) + an `active_backend: BackendTag`
1189/// discriminator: the parallel layout kept drained inner structures
1190/// allocated across backend switches, and every per-frame/per-slice
1191/// driver operation had to dispatch on `active_backend` to pick the
1192/// right field. A single enum collapses the storage and makes the
1193/// dispatcher pattern-match on the storage variant directly — same
1194/// number of arms, but `storage.backend()` is now the canonical source
1195/// of truth and dead variants are dropped when the active backend
1196/// changes.
1197#[derive(Clone)]
1198enum MatcherStorage {
1199    /// Upstream zstd `ZSTD_fast` family. Constructed by
1200    /// [`MatchGeneratorDriver::new`] as the initial variant and
1201    /// re-selected by [`Matcher::reset`] for any [`CompressionLevel`]
1202    /// that `resolve_level_params` maps to [`StrategyTag::Fast`]
1203    /// (`Uncompressed`, `Fastest`, `Level(1)`, `Level(2)`, and every
1204    /// negative `Level(n)`).
1205    Simple(FastKernelMatcher),
1206    /// Upstream zstd `ZSTD_dfast` family — two-table hash chain. Selected for
1207    /// any level that resolves to [`StrategyTag::Dfast`] in
1208    /// `resolve_level_params` (`Default`, `Level(3)`, `Level(4)`, and
1209    /// `Level(0)`, which aliases `DEFAULT_LEVEL`).
1210    Dfast(DfastMatchGenerator),
1211    /// Row-hash match finder (`SearchMethod::RowHash`). Selected for the
1212    /// `LEVEL_TABLE` rows that carry a `RowConfig`: [`StrategyTag::Greedy`]
1213    /// (`Level(5)`) and [`StrategyTag::Lazy`] (`Level(6..=12)`, `Better`).
1214    Row(RowMatchGenerator),
1215    /// Hash-chain / binary-tree family: upstream zstd `btlazy2` and the
1216    /// BT-based optimal modes (`btopt` / `btultra` / `btultra2`). Selected
1217    /// for the `LEVEL_TABLE` rows that carry an `HcConfig`, i.e.
1218    /// [`StrategyTag::Btlazy2`], [`StrategyTag::BtOpt`],
1219    /// [`StrategyTag::BtUltra`], and [`StrategyTag::BtUltra2`]
1220    /// (`Best`, `Level(13..=22)`, and any `Level(n)` with
1221    /// `n > MAX_LEVEL` — `resolve_level_params` clamps positive
1222    /// numeric levels to the last table row via `min(MAX_LEVEL)`, so
1223    /// `Level(23..=i32::MAX)` all land on `BtUltra2` here). The
1224    /// [`HcMatchGenerator`]'s internal [`HcBackend`] discriminator
1225    /// decides whether BT scratch is allocated.
1226    HashChain(HcMatchGenerator),
1227}
1228
1229impl MatcherStorage {
1230    /// Heap bytes the active backend variant holds (tables, history, scratch).
1231    fn heap_size(&self) -> usize {
1232        match self {
1233            Self::Simple(m) => m.heap_size(),
1234            Self::Dfast(m) => m.heap_size(),
1235            Self::Row(m) => m.heap_size(),
1236            Self::HashChain(m) => m.heap_size(),
1237        }
1238    }
1239
1240    /// [`super::strategy::BackendTag`] family of the active variant.
1241    fn backend(&self) -> super::strategy::BackendTag {
1242        use super::strategy::BackendTag;
1243        match self {
1244            Self::Simple(_) => BackendTag::Simple,
1245            Self::Dfast(_) => BackendTag::Dfast,
1246            Self::Row(_) => BackendTag::Row,
1247            Self::HashChain(_) => BackendTag::HashChain,
1248        }
1249    }
1250}
1251
/// This is the default implementation of the `Matcher` trait. It allocates and reuses the buffers when possible.
pub struct MatchGeneratorDriver {
    // Pool of reusable input buffers. `recycle_simple_space` pushes
    // cleared (len = 0, capacity retained) buffers here so they can be
    // handed out again instead of allocating a fresh `Vec<u8>` per block.
    vec_pool: Vec<Vec<u8>>,
    /// Active match-finder state. Exactly one backend lives here at a
    /// time; [`Matcher::reset`] drains the previous variant into
    /// `vec_pool` before swapping in a freshly constructed variant for
    /// the new backend. `storage.backend()` is the canonical source of
    /// truth for the parse family; `strategy_tag` carries the
    /// compile-time strategy chosen at the last `reset()`.
    storage: MatcherStorage,
    // Strategy tag resolved at `reset()` from the requested
    // `CompressionLevel`'s `LevelParams`. It names a compile-time
    // `Strategy` monomorphisation (`compress_block::<S>`): the
    // hot-block dispatcher in `blocks/compressed.rs` matches on this
    // tag to enter the corresponding instantiation.
    strategy_tag: super::strategy::StrategyTag,
    // Decoupled search-method axis resolved at `reset()` from
    // `LevelParams.search`. The per-block dispatcher routes on this
    // (not on `strategy_tag`) so a level's parse and search backend can
    // be chosen independently. The `BinaryTree` arm still consults
    // `strategy_tag` to pick the opt `Strategy` ZST.
    search: super::strategy::SearchMethod,
    // Decoupled parse-mode axis resolved at `reset()` from
    // `LevelParams::parse()`. Independent of `search`: greedy / lazy /
    // lazy2 can run on any non-opt search backend. The backends still
    // read their own `lazy_depth` (kept in sync at `reset()`); this is
    // the authoritative parse selector for the dispatcher.
    parse: super::strategy::ParseMode,
    /// Test-only per-level recipe override applied in `reset()` before
    /// backend selection. Lets the parse×search matrix be exercised
    /// without editing `LEVEL_TABLE`; never compiled into production.
    #[cfg(test)]
    config_override: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    /// Fine-grained per-knob overrides from the public
    /// [`super::parameters::CompressionParameters`] surface (#27).
    /// `None` (or an all-`None` [`super::parameters::ParamOverrides`])
    /// keeps the resolved level geometry byte-identical to plain
    /// level-based compression. Applied in [`Matcher::reset`] after the
    /// level params are resolved, before backend selection. Persists
    /// across resets (it is frame configuration, not a one-shot) until
    /// the caller changes it.
    param_overrides: Option<super::parameters::ParamOverrides>,
    // Current per-block buffer size. Initialised from the constructor's
    // `slice_size` argument; per the note in `recycle_simple_space`, a
    // small source-size hint can shrink it.
    slice_size: usize,
    // The constructor's `slice_size` argument as originally passed.
    // NOTE(review): presumably the un-shrunk value `slice_size` is
    // restored from on reset — confirm against `reset()`.
    base_slice_size: usize,
    // Frame header window size must stay at the configured live-window budget.
    // Dictionary retention expands internal matcher capacity only.
    reported_window_size: usize,
    // Tracks currently retained bytes that originated from primed dictionary
    // history and have not been evicted yet.
    dictionary_retained_budget: usize,
    // Source size hint for next frame (set via set_source_size_hint, cleared on reset).
    source_size_hint: Option<u64>,
    // Dictionary content size for the next frame (set via set_dictionary_size_hint,
    // consumed on reset). When present on a binary-tree / hash-chain backend, the
    // match-finder hash/chain tables are sized from the DICTIONARY (upstream zstd CDict
    // economics: a loaded dictionary supplies the long matches, so the live tables
    // can shrink to the dict's size tier) while the eviction window stays
    // source-sized. Mirrors upstream zstd `ZSTD_getCParamRowSize`, which picks the cParams
    // table column from `dictSize` for a dictionary-bearing compress.
    dictionary_size_hint: Option<usize>,
    // Normalized `ceil_log2` bucket of the frame's source-size hint, captured at
    // `reset` (where `source_size_hint` is consumed) via [`source_size_ceil_log`].
    // `None` means the frame was unhinted. Drives `prime_with_dictionary`'s upstream zstd
    // `ZSTD_shouldAttachDict` mode for the Simple/Fast backend: `None` (unknown)
    // or `<= FAST_ATTACH_DICT_CUTOFF_LOG` → attach (separate dict table, 2-cursor
    // `compress_block_fast_dict`); larger → copy (dictionary primed into the live
    // table, 4-cursor `compress_block_fast`). The primed-snapshot key is the
    // resolved shape ([`reset_shape`](Self::reset_shape)), not this bucket.
    reset_size_log: Option<u8>,
    // Hint-resolved matcher shape from the last `reset`: the [`LevelParams`], the
    // active backend's applied Dfast/Row hash-table width (`0` for HC/Fast), the
    // Fast attach-vs-copy mode, and the active LDM override (#27). Combined with
    // the frame's level into the [`PrimedKey`] that keys the primed snapshot, so
    // it is only restored into a reset that resolved the identical matcher AND
    // LDM configuration. `None` before the first `reset`.
    reset_shape: Option<(
        LevelParams,
        usize,
        bool,
        Option<super::parameters::LdmOverride>,
    )>,
    // One-shot borrowed block range `[start, end)` staged by the borrowed
    // frame path (`set_borrowed_block`) for the NEXT
    // `start_matching` / `skip_matching_with_hint`. `set_borrowed_block`
    // stages the same range on whichever backend is active (it is not
    // limited to the Simple backend). `Some` routes that call to the
    // borrowed scan instead of the owned committed-block path; consumed
    // (reset to `None`) by the routed call, and also cleared by
    // `clear_borrowed_window`. Always `None` on the owned streaming path.
    borrowed_pending: Option<(usize, usize)>,
    /// CDict-equivalent: snapshot of the post-prime matcher state taken
    /// once after the first dictionary prime — the backend `storage`
    /// (hash tables + dictionary history + offset history + window) plus
    /// the driver-level `dictionary_retained_budget`, the only two pieces
    /// `prime_with_dictionary` writes. Subsequent frames restore this
    /// (a table memcpy) instead of re-hashing every dictionary position,
    /// mirroring upstream zstd `ZSTD_compressBegin_usingCDict` copying the
    /// precomputed `cdict->matchState`. Invalidated when the dictionary
    /// changes; keyed by the [`PrimedKey`] resolved matcher shape so a snapshot
    /// is only restored into a reset that produces the same matcher — see
    /// `restore_primed_dictionary`.
    primed: Option<(MatcherStorage, usize, PrimedKey)>,
}
1353
/// Identity of the matcher configuration a primed snapshot was captured under:
/// the FULLY RESOLVED matcher shape, not the raw source-size hint.
///
/// `reset()` resolves the hint into a [`LevelParams`] (window_log cap, the
/// HC/Fast table and search geometry, the parse depth/target-length that get
/// baked into the restored `storage`) plus, for the Dfast/Row backends, a
/// table-width derived from the hint's ceil-log bucket. The mapping from hint
/// to resolved shape is many-to-one: the source-size adjustment is monotone in
/// `ceil_log2(hint)`, and Level 22 additionally collapses several buckets onto
/// one upstream zstd tier (its `<= 16/128/256 KiB` thresholds). Keying on the raw hint
/// (or even its ceil-log bucket) therefore over-keys — two hints that resolve
/// to the identical matcher would each force a full re-prime. Keying on the
/// resolved (`params`, `table_bits`) pair restores across them.
///
/// `table_bits` is the hint-dependent hash-table width the ACTIVE backend
/// applied (`set_hash_bits` value for Dfast/Row; `0` for HC/Fast, whose widths
/// already live in `params`). The snapshot is only ever captured on the COPY
/// path (a hinted, above-cutoff frame), so `table_bits` is always the resolved
/// Dfast/Row value there, never the unhinted default.
///
/// `level` is kept alongside the resolved `params` because some stored matcher
/// state is derived from the level DIRECTLY, not through `params`: e.g. Dfast's
/// `use_fast_loop` is true for L3 but false for L4, yet L3 and L4 resolve to
/// byte-identical `params`. Without `level` a snapshot captured at L3 could be
/// restored into an L4 reset, installing the wrong `use_fast_loop`.
///
/// `fast_attach` records the Fast backend's attach-vs-copy mode
/// ([`FAST_ATTACH_DICT_CUTOFF_LOG`]) because that cutoff (8 KiB) falls INSIDE a
/// single resolved shape: an 8192- and an 8193-byte Level 1 hint both clamp to
/// window_log 14 with identical `params`/`table_bits`, yet 8192 attaches (a
/// separate dict table) while 8193 copies into the live table — two different
/// `storage` shapes. The frame compressor only captures/restores snapshots on
/// the copy path today, but keying on the mode keeps the snapshot identity
/// self-sufficient rather than relying on that external gate.
///
/// Restoring a snapshot whose key differs would reinstate the old `storage`
/// (and its `max_window_size` / table dimensions / parse params / dict-table
/// shape) under a reset that resolved a different shape — the encoder could
/// then search past the frame header's window and emit an undecodable match.
/// All fields must match before a restore is allowed.
#[derive(Clone, Copy, PartialEq, Eq)]
struct PrimedKey {
    /// Compression level the snapshot was captured at. Needed in addition
    /// to `params` because some stored matcher state is derived from the
    /// level directly rather than through `params`.
    level: super::CompressionLevel,
    /// Fully resolved level parameters produced by `reset()` for the
    /// capturing frame.
    params: LevelParams,
    /// Hash-table width the active backend applied: the `set_hash_bits`
    /// value for Dfast/Row, `0` for HC/Fast.
    table_bits: usize,
    /// Fast backend attach-vs-copy dictionary mode at capture time.
    /// NOTE(review): presumably `true` means attach — confirm at the
    /// site that builds the key.
    fast_attach: bool,
    /// Fine-grained LDM override (#27) active at capture time. The
    /// snapshot's cloned `storage` carries `BtMatcher::ldm_producer`,
    /// which is configured from this override; restoring a snapshot
    /// captured under a different LDM configuration (enable flip or
    /// changed knobs) would reinstate a stale producer. `params` already
    /// pins `window_log` / `strategy_tag` (the rest of the producer's
    /// identity), so folding the override completes the LDM identity.
    /// `None` = LDM off, matching `ParamOverrides::ldm`.
    ldm: Option<super::parameters::LdmOverride>,
}
1410
1411impl MatchGeneratorDriver {
1412    /// `slice_size` sets the base block allocation size used for matcher input chunks.
1413    /// `max_slices_in_window` determines the initial window capacity at construction
1414    /// time. Effective window sizing is recalculated on every [`reset`](Self::reset)
1415    /// from the resolved compression level and optional source-size hint.
1416    pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
1417        // Validate inputs before deriving window_log_init. Three
1418        // failure modes need explicit guards:
1419        //
1420        // 1. Zero args → `max_window_size = 0` → silent 1-byte
1421        //    degenerate window (useless).
1422        // 2. Multiplication overflow on `slice_size *
1423        //    max_slices_in_window` → wraps silently in release.
1424        // 3. `next_power_of_two` overflow when the product is
1425        //    above `1 << (usize::BITS - 1)` → modern Rust PANICS
1426        //    on overflow (older Rust returned 0).
1427        //
1428        // Catch all three at construction with a clear domain-
1429        // specific message via `assert!` + `checked_mul` +
1430        // `checked_next_power_of_two`, rather than letting either
1431        // mode produce a silent degenerate matcher OR a generic
1432        // panic deep in `FastKernelMatcher::with_params`.
1433        assert!(
1434            slice_size > 0,
1435            "MatchGeneratorDriver::new requires slice_size > 0 (got 0)",
1436        );
1437        assert!(
1438            max_slices_in_window > 0,
1439            "MatchGeneratorDriver::new requires max_slices_in_window > 0 (got 0)",
1440        );
1441        let max_window_size = max_slices_in_window
1442            .checked_mul(slice_size)
1443            .expect("MatchGeneratorDriver::new: slice_size * max_slices_in_window overflows usize");
1444        // Derive an effective window_log for the initial-state matcher.
1445        // `MatchGeneratorDriver::new` runs BEFORE any reset, so it has
1446        // no LevelParams to consult — we initialise to whatever
1447        // window_log fits the caller's requested max_window_size
1448        // (round up to the next power of two via `next_power_of_two`'s
1449        // log). Reset() overwrites all three params from the resolved
1450        // LevelParams.
1451        //
1452        // `checked_next_power_of_two` returns `None` if the next power
1453        // of two would overflow `usize`. Modern Rust's
1454        // `next_power_of_two` PANICS on overflow rather than returning
1455        // 0 (the panic message is generic and unhelpful), so use the
1456        // checked variant to surface the failure with a clear,
1457        // domain-specific error.
1458        let next_pow2 = max_window_size.checked_next_power_of_two().expect(
1459            "MatchGeneratorDriver::new: max_window_size too large for \
1460             next_power_of_two without overflow",
1461        );
1462        let window_log_init = next_pow2.trailing_zeros() as u8;
1463        Self {
1464            vec_pool: Vec::new(),
1465            storage: MatcherStorage::Simple(FastKernelMatcher::with_params(
1466                window_log_init,
1467                FAST_LEVEL_1_HASH_LOG,
1468                FAST_LEVEL_1_MLS,
1469                2, // upstream zstd default step_size (targetLength=0 → step=2)
1470            )),
1471            strategy_tag: super::strategy::StrategyTag::Fast,
1472            search: super::strategy::SearchMethod::Fast,
1473            parse: super::strategy::ParseMode::Greedy,
1474            #[cfg(test)]
1475            config_override: None,
1476            param_overrides: None,
1477            slice_size,
1478            base_slice_size: slice_size,
1479            // Report the ROUNDED-UP window size that the matcher
1480            // actually carries (via `window_log_init = log2(next_pow2)`
1481            // → matcher's `max_window_size = 1 << window_log_init =
1482            // next_pow2`). For non-power-of-two `slice_size *
1483            // max_slices_in_window` inputs, the unrounded value
1484            // would under-report the active backend's window until
1485            // the first `reset()` overwrites both sides from the
1486            // resolved LevelParams.
1487            reported_window_size: next_pow2,
1488            reset_size_log: None,
1489            reset_shape: None,
1490            dictionary_retained_budget: 0,
1491            source_size_hint: None,
1492            dictionary_size_hint: None,
1493            borrowed_pending: None,
1494            primed: None,
1495        }
1496    }
1497
    /// Resolves `level`, together with the optional `source_size` hint,
    /// into its [`LevelParams`]. Pure delegation to
    /// `resolve_level_params`; no driver state is read or written.
    fn level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
        resolve_level_params(level, source_size)
    }
1501
    /// Install the public-parameter per-knob overrides (#27) applied at
    /// the next [`Matcher::reset`]. `None` (or an all-`None` set) restores
    /// plain level-based geometry. Persists across resets until changed.
    ///
    /// This only stores the value in `param_overrides`; the active
    /// backend is not touched by this call.
    pub(crate) fn set_param_overrides(
        &mut self,
        overrides: Option<super::parameters::ParamOverrides>,
    ) {
        self.param_overrides = overrides;
    }
1511
    /// Active backend family derived from the storage variant. Single
    /// source of truth — no separate runtime tag to drift against.
    ///
    /// Returns `self.storage.backend()`, i.e. the
    /// [`super::strategy::BackendTag`] matching whichever
    /// `MatcherStorage` variant is currently held.
    pub(crate) fn active_backend(&self) -> super::strategy::BackendTag {
        self.storage.backend()
    }
1517
1518    /// Whether the borrowed (no-copy, in-place over-window) scan is
1519    /// implemented for the current backend + search configuration. The
1520    /// HashChain backend serves both the lazy CHAIN parser
1521    /// (`SearchMethod::HashChain`) and the BT/optimal parsers
1522    /// (`SearchMethod::BinaryTree`); only the lazy chain has a borrowed scan
1523    /// so far, so BT/optimal stay on the owned path.
1524    pub(crate) fn borrowed_supported(&self) -> bool {
1525        use super::strategy::{BackendTag, SearchMethod, StrategyTag};
1526        match self.active_backend() {
1527            BackendTag::Simple | BackendTag::Dfast | BackendTag::Row => true,
1528            // The HashChain backend covers two searches: the lazy CHAIN parser
1529            // (borrowed-capable) and the BINARY-TREE search (btlazy2 L13-15 +
1530            // optimal BtOpt/BtUltra/BtUltra2 L16-22). btlazy2's BT-tree borrowed
1531            // scan is byte-identical to owned (reads via live_history()), so it
1532            // takes the in-place path. The OPTIMAL parsers stay owned: their
1533            // cost-based DP is sensitive to candidate quality, and the borrowed
1534            // continuous-index scan yields slightly different (ratio-worse)
1535            // candidates than the owned evict+rehash scan — borrowed optimal
1536            // both diverged from owned and fell outside the ffi ratio bound.
1537            // Search-aware (not just strategy_tag) so optimal BT can never be
1538            // staged on the borrowed path even via an internal caller.
1539            BackendTag::HashChain => match self.search {
1540                SearchMethod::HashChain => true,
1541                SearchMethod::BinaryTree => matches!(self.strategy_tag, StrategyTag::Btlazy2),
1542                _ => false,
1543            },
1544        }
1545    }
1546
1547    /// Whether a DICTIONARY frame can take the borrowed (no input copy) path.
1548    /// Only the Simple (Fast) backend with the dictionary ATTACHED (not the
1549    /// copy/merge regime) has a borrowed dict scan — `start_matching_borrowed_dict`
1550    /// reads live matches from the borrowed input in place and dict matches
1551    /// from the committed dict prefix via the 2-segment counter. Every other
1552    /// backend, and copy-mode (large-input) dict frames, stay on the owned
1553    /// path. Checked AFTER priming, so `is_attached()` reflects the resolved
1554    /// attach-vs-copy decision.
1555    pub(crate) fn borrowed_dict_supported(&self) -> bool {
1556        matches!(
1557            &self.storage,
1558            MatcherStorage::Simple(m) if m.dict_is_attached()
1559        )
1560    }
1561
1562    fn simple_mut(&mut self) -> &mut FastKernelMatcher {
1563        match &mut self.storage {
1564            MatcherStorage::Simple(m) => m,
1565            _ => panic!("simple backend must be initialized by reset() before use"),
1566        }
1567    }
1568
1569    /// Reclaim the per-block input buffer that the Simple backend
1570    /// just spent inside `start_matching` / `skip_matching_with_hint`.
1571    ///
1572    /// `FastKernelMatcher::take_recycled_space` returns the cleared
1573    /// (capacity-retained) `Vec<u8>` from the last
1574    /// `extend_history_with_pending`. We push it onto `vec_pool`
1575    /// as-is (with `len = 0`); `get_next_space()` is responsible for
1576    /// resizing the buffer back to `slice_size` on its next pop. The
1577    /// pushed length is irrelevant — only the capacity matters, and
1578    /// `extend_history_with_pending` preserves it. Without this
1579    /// recycle path, the Simple backend would allocate a new
1580    /// `Vec<u8>` per block — a measurable hot-path cost when blocks
1581    /// are small (~128 KiB) and processed at hundreds of MiB/s.
1582    fn recycle_simple_space(&mut self) {
1583        if let Some(space) = self.simple_mut().take_recycled_space() {
1584            // `space` is already cleared (len = 0) by
1585            // `extend_history_with_pending`; capacity is retained.
1586            // Leaving `len = 0` here avoids the cost of zero-filling
1587            // the entire allocation — `get_next_space()` resizes the
1588            // popped buffer up to `slice_size` on demand, so the
1589            // length the pool holds is irrelevant. This matters most
1590            // after a small-source-size hint has shrunk `slice_size`
1591            // mid-frame: the recycled buffer can be much larger than
1592            // the current `slice_size`, and zero-filling 128 KiB+ on
1593            // every block would erase the perf win the recycle path
1594            // is meant to deliver.
1595            self.vec_pool.push(space);
1596        }
1597    }
1598
1599    /// Register a caller-owned input buffer as the Simple backend's
1600    /// borrowed one-shot match window. Only valid on the Simple (Fast)
1601    /// backend; the one-shot frame path gates on that before calling.
1602    ///
1603    /// # Safety
1604    /// Same contract as [`FastKernelMatcher::set_borrowed_window`]: the
1605    /// buffer must stay live and unmodified until the window is cleared,
1606    /// and must be cleared before the buffer is dropped or the matcher is
1607    /// reused for another frame.
1608    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
1609        // SAFETY: forwarded contract — caller upholds liveness/clear.
1610        match self.active_backend() {
1611            super::strategy::BackendTag::Simple => unsafe {
1612                self.simple_mut().set_borrowed_window(buffer)
1613            },
1614            super::strategy::BackendTag::Dfast => unsafe {
1615                self.dfast_matcher_mut().set_borrowed_window(buffer)
1616            },
1617            super::strategy::BackendTag::Row => unsafe {
1618                self.row_matcher_mut().set_borrowed_window(buffer)
1619            },
1620            super::strategy::BackendTag::HashChain => unsafe {
1621                self.hc_matcher_mut().set_borrowed_window(buffer)
1622            },
1623        }
1624    }
1625
1626    /// Clear the borrowed one-shot window, returning the active backend
1627    /// to the owned `history` path.
1628    pub(crate) fn clear_borrowed_window(&mut self) {
1629        match self.active_backend() {
1630            super::strategy::BackendTag::Simple => self.simple_mut().clear_borrowed_window(),
1631            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().clear_borrowed_window(),
1632            super::strategy::BackendTag::Row => self.row_matcher_mut().clear_borrowed_window(),
1633            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().clear_borrowed_window(),
1634            #[allow(unreachable_patterns)]
1635            _ => {}
1636        }
1637        self.borrowed_pending = None;
1638    }
1639
1640    /// Stage the borrowed block range `[block_start, block_end)` for the
1641    /// NEXT `start_matching` / `skip_matching_with_hint`, which the
1642    /// borrowed Fast frame path uses in place of `commit_space`. While
1643    /// staged, those trait calls route to the Simple backend's borrowed
1644    /// scan/skip (consuming the stage) instead of the owned committed
1645    /// block. See [`Matcher::start_matching`] /
1646    /// [`Matcher::skip_matching_with_hint`] on this type.
1647    pub(crate) fn set_borrowed_block(&mut self, block_start: usize, block_end: usize) {
1648        assert!(
1649            self.borrowed_supported(),
1650            "borrowed block staging is not supported for the active backend/search config",
1651        );
1652        assert!(
1653            block_start <= block_end,
1654            "borrowed block range must satisfy start <= end (start={block_start} end={block_end})",
1655        );
1656        self.borrowed_pending = Some((block_start, block_end));
1657        // Make the range visible to `get_last_space()` immediately: the
1658        // emit pipeline reads `get_last_space().len()` in
1659        // `collect_block_parts` BEFORE `start_matching` consumes the
1660        // stage, so the staged block (not the whole borrowed window) must
1661        // be reported now to keep the literal-buffer reservation right.
1662        match self.active_backend() {
1663            super::strategy::BackendTag::Simple => self
1664                .simple_mut()
1665                .stage_borrowed_block(block_start, block_end),
1666            super::strategy::BackendTag::Dfast => self
1667                .dfast_matcher_mut()
1668                .stage_borrowed_block(block_start, block_end),
1669            super::strategy::BackendTag::Row => self
1670                .row_matcher_mut()
1671                .stage_borrowed_block(block_start, block_end),
1672            super::strategy::BackendTag::HashChain => self
1673                .hc_matcher_mut()
1674                .table
1675                .stage_borrowed_block(block_start, block_end),
1676        }
1677    }
1678
1679    #[cfg(test)]
1680    fn dfast_matcher(&self) -> &DfastMatchGenerator {
1681        match &self.storage {
1682            MatcherStorage::Dfast(m) => m,
1683            _ => panic!("dfast backend must be initialized by reset() before use"),
1684        }
1685    }
1686
1687    fn dfast_matcher_mut(&mut self) -> &mut DfastMatchGenerator {
1688        match &mut self.storage {
1689            MatcherStorage::Dfast(m) => m,
1690            _ => panic!("dfast backend must be initialized by reset() before use"),
1691        }
1692    }
1693
1694    #[cfg(test)]
1695    fn row_matcher(&self) -> &RowMatchGenerator {
1696        match &self.storage {
1697            MatcherStorage::Row(m) => m,
1698            _ => panic!("row backend must be initialized by reset() before use"),
1699        }
1700    }
1701
1702    fn row_matcher_mut(&mut self) -> &mut RowMatchGenerator {
1703        match &mut self.storage {
1704            MatcherStorage::Row(m) => m,
1705            _ => panic!("row backend must be initialized by reset() before use"),
1706        }
1707    }
1708
1709    #[cfg(test)]
1710    fn hc_matcher(&self) -> &HcMatchGenerator {
1711        match &self.storage {
1712            MatcherStorage::HashChain(m) => m,
1713            _ => panic!("hash chain backend must be initialized by reset() before use"),
1714        }
1715    }
1716
1717    fn hc_matcher_mut(&mut self) -> &mut HcMatchGenerator {
1718        match &mut self.storage {
1719            MatcherStorage::HashChain(m) => m,
1720            _ => panic!("hash chain backend must be initialized by reset() before use"),
1721        }
1722    }
1723
1724    /// Shrink the active backend's `max_window_size` by the bytes
1725    /// reclaimed from the dictionary-retention budget. Returns `true`
1726    /// iff any reclamation happened — the caller uses that as the
1727    /// gate for [`Self::trim_after_budget_retire`] (which is a no-op
1728    /// otherwise: with `max_window_size` unchanged the backend's
1729    /// `trim_to_window` cannot find anything to evict, so calling it
1730    /// just runs an extra `match` ladder + a single early-out check
1731    /// per slice commit).
1732    #[must_use]
1733    fn retire_dictionary_budget(&mut self, evicted_bytes: usize) -> bool {
1734        let reclaimed = evicted_bytes.min(self.dictionary_retained_budget);
1735        if reclaimed == 0 {
1736            return false;
1737        }
1738        self.dictionary_retained_budget -= reclaimed;
1739        match self.active_backend() {
1740            super::strategy::BackendTag::Simple => {
1741                let matcher = self.simple_mut();
1742                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1743                // retained dict budget is tracked independently and the
1744                // window may already have been shrunk by a prior eviction,
1745                // so the floor at 0 is the correct clamp, not a masked bug.
1746                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1747            }
1748            super::strategy::BackendTag::Dfast => {
1749                let matcher = self.dfast_matcher_mut();
1750                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1751                // retained dict budget is tracked independently and the
1752                // window may already have been shrunk by a prior eviction,
1753                // so the floor at 0 is the correct clamp, not a masked bug.
1754                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1755            }
1756            super::strategy::BackendTag::Row => {
1757                let matcher = self.row_matcher_mut();
1758                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1759                // retained dict budget is tracked independently and the
1760                // window may already have been shrunk by a prior eviction,
1761                // so the floor at 0 is the correct clamp, not a masked bug.
1762                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1763            }
1764            super::strategy::BackendTag::HashChain => {
1765                let matcher = self.hc_matcher_mut();
1766                // See the Simple arm: `reclaimed` may exceed the current
1767                // window, so saturating to 0 is the correct clamp.
1768                matcher.table.max_window_size =
1769                    matcher.table.max_window_size.saturating_sub(reclaimed);
1770            }
1771        }
1772        true
1773    }
1774
1775    fn trim_after_budget_retire(&mut self) {
1776        loop {
1777            let mut evicted_bytes = 0usize;
1778            match self.active_backend() {
1779                super::strategy::BackendTag::Simple => {
1780                    // FastKernelMatcher owns its history as a single
1781                    // flat `Vec<u8>` (upstream zstd's flat-buffer layout)
1782                    // rather than the legacy per-block `WindowEntry`
1783                    // stack. There are no per-block Vec allocations
1784                    // to recycle into `vec_pool` — `trim_to_window`
1785                    // drains the oldest bytes in-place and returns
1786                    // the count for the dictionary-budget loop's
1787                    // termination check.
1788                    let MatcherStorage::Simple(m) = &mut self.storage else {
1789                        unreachable!("active_backend() == Simple proven above");
1790                    };
1791                    evicted_bytes += m.trim_to_window();
1792                }
1793                super::strategy::BackendTag::Dfast => {
1794                    // Dfast doesn't retain input Vecs — `history` is the
1795                    // only byte store, so there is no per-block buffer
1796                    // to push back through a callback. Eviction byte
1797                    // count is derived from the `window_size` delta
1798                    // before/after; the Dfast variant of
1799                    // `trim_to_window` takes no closure, sidestepping
1800                    // an unused-`impl FnMut` monomorphization that
1801                    // would otherwise contractually never fire.
1802                    let dfast = self.dfast_matcher_mut();
1803                    let pre = dfast.window_size;
1804                    dfast.trim_to_window();
1805                    evicted_bytes += pre - dfast.window_size;
1806                }
1807                super::strategy::BackendTag::Row => {
1808                    // Row keeps bytes only in the contiguous `history` mirror
1809                    // (block buffers are returned to the pool per block in
1810                    // `add_data`), so derive the eviction count from the
1811                    // `window_size` delta, mirroring the Dfast / HashChain arms.
1812                    let row = self.row_matcher_mut();
1813                    let pre = row.window_size;
1814                    row.trim_to_window();
1815                    evicted_bytes += pre - row.window_size;
1816                }
1817                super::strategy::BackendTag::HashChain => {
1818                    // HC keeps bytes only in the contiguous `history` mirror
1819                    // (no per-block Vecs to recycle since the window<->history
1820                    // dedup), so derive the eviction count from the
1821                    // `window_size` delta, mirroring the Dfast arm above.
1822                    let table = &mut self.hc_matcher_mut().table;
1823                    let pre = table.window_size;
1824                    table.trim_to_window();
1825                    evicted_bytes += pre - table.window_size;
1826                }
1827            }
1828            if evicted_bytes == 0 {
1829                break;
1830            }
1831            // The loop's invariant is "the backend's previous
1832            // `max_window_size` shrink had downstream bytes left to
1833            // evict" — that's what `evicted_bytes != 0` proves at
1834            // this point. `dictionary_retained_budget` is NOT
1835            // guaranteed to be positive here: the outer
1836            // `retire_dictionary_budget` call may have already
1837            // drained it to zero by reclaiming the last retained
1838            // bytes, while the backend still has bytes above the
1839            // freshly-shrunk window cap waiting for this loop to
1840            // evict. The return value of the retire call below is
1841            // therefore intentionally discarded — the loop's
1842            // termination is driven by `evicted_bytes == 0`, not by
1843            // whether the budget has more bytes left to reclaim.
1844            let _ = self.retire_dictionary_budget(evicted_bytes);
1845        }
1846    }
1847
1848    /// ATTACH (`true`) vs COPY (`false`) decision for the dms-bearing HashChain
1849    /// backend (lazy hash-chain AND binary-tree/optimal levels), mirroring
1850    /// upstream `ZSTD_shouldAttachDict` and its per-strategy `attachDictSizeCutoffs`:
1851    /// a small / unknown source ATTACHES the dict as a separate dms (hash-chain
1852    /// dms for lazy, DUBT dms for BT); a large known source COPIES it into the
1853    /// live chain / tree. The cutoff is the lazy/lazy2 value for HC, the
1854    /// btlazy2/btopt value for Bt{Opt}, and the smaller btultra/btultra2 value for
1855    /// the deepest parses. Both `skip_matching_for_dictionary_priming` (which
1856    /// stages the dict) and `prime_with_dictionary` (which builds-or-drops the
1857    /// dms) read this so the two stay in lock-step.
1858    fn hc_dict_attach_mode(&self) -> bool {
1859        // Only the HashChain backend (lazy hash-chain + BT/optimal) routes here;
1860        // a non-HashChain storage has no dms decision, so default to attach.
1861        let MatcherStorage::HashChain(hc) = &self.storage else {
1862            return true;
1863        };
1864        let cutoff = if hc.table.uses_bt {
1865            match hc.strategy_tag {
1866                super::strategy::StrategyTag::BtUltra | super::strategy::StrategyTag::BtUltra2 => {
1867                    BT_ULTRA_ATTACH_DICT_CUTOFF_LOG
1868                }
1869                _ => BT_OPT_ATTACH_DICT_CUTOFF_LOG,
1870            }
1871        } else {
1872            HC_ATTACH_DICT_CUTOFF_LOG
1873        };
1874        self.reset_size_log.is_none_or(|log| log <= cutoff)
1875    }
1876
1877    fn skip_matching_for_dictionary_priming(&mut self) {
1878        match self.active_backend() {
1879            super::strategy::BackendTag::Simple => {
1880                // Upstream zstd `ZSTD_shouldAttachDict` mode selection for the Fast
1881                // strategy (cutoff 8 KB): small / unknown-size inputs ATTACH
1882                // (index dict positions into a SEPARATE immutable table; the
1883                // dual-probe 2-cursor `compress_block_fast_dict` then prefers
1884                // recent-input matches and falls back to the dict — the path
1885                // that wins small/unknown). Large known-size inputs COPY (prime
1886                // dict into the live table; the 4-cursor `compress_block_fast`
1887                // matches against it as window history — the path that already
1888                // matches/beats the upstream zstd on large corpora). The dispatch in
1889                // `start_matching` keys off `dict_table.is_some()`, which only
1890                // the attach path populates. See [`FAST_ATTACH_DICT_CUTOFF_LOG`].
1891                let attach = self
1892                    .reset_size_log
1893                    .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
1894                if attach {
1895                    self.simple_mut().skip_matching_for_dict_prime();
1896                } else {
1897                    self.simple_mut().skip_matching_with_hint(Some(false));
1898                }
1899                self.recycle_simple_space();
1900            }
1901            super::strategy::BackendTag::Dfast => {
1902                // Upstream zstd `ZSTD_dictMatchState` mode selection for dfast (cutoff
1903                // 16 KiB): small / unknown-size inputs ATTACH (build the
1904                // separate immutable dict long+short tables; the dual-probe
1905                // `start_matching_fast_loop` searches live + dict, the path that
1906                // avoids the per-frame dict re-prime that dominates small
1907                // `compress-dict`). Larger known-size inputs COPY (re-prime the
1908                // dict into the live tables via `skip_matching_dense`, where the
1909                // dense scan matches it as window history). `skip_matching_for_dict_attach`
1910                // self-gates on `use_fast_loop` (only fast-loop levels carry the
1911                // dual-probe; general-path levels fall back to the dense copy).
1912                let attach = self
1913                    .reset_size_log
1914                    .is_none_or(|log| log <= DFAST_ATTACH_DICT_CUTOFF_LOG);
1915                if attach {
1916                    self.dfast_matcher_mut().skip_matching_for_dict_attach();
1917                } else {
1918                    self.dfast_matcher_mut().invalidate_dict_cache();
1919                    self.dfast_matcher_mut().skip_matching_dense();
1920                }
1921            }
1922            super::strategy::BackendTag::Row => {
1923                // Upstream zstd `ZSTD_RowFindBestMatch` `dictMatchState`: small /
1924                // unknown-size inputs ATTACH (build the separate immutable dict
1925                // row index; the bounded dual-probe in `row_candidate_rl`
1926                // searches live + dict, avoiding the per-frame dict re-index),
1927                // larger known-size inputs COPY (dense re-prime into the live
1928                // rows).
1929                let attach = self
1930                    .reset_size_log
1931                    .is_none_or(|log| log <= ROW_ATTACH_DICT_CUTOFF_LOG);
1932                if attach {
1933                    self.row_matcher_mut().prime_dict_attach_current_block();
1934                } else {
1935                    self.row_matcher_mut().invalidate_dict_cache();
1936                    self.row_matcher_mut().skip_matching_with_hint(Some(false));
1937                }
1938            }
1939            super::strategy::BackendTag::HashChain => {
1940                // Lazy-HC AND BT/optimal both follow upstream zstd `ZSTD_shouldAttachDict`
1941                // per-strategy: ATTACH (a separate dms — hash-chain dms for lazy,
1942                // DUBT dms for BT) for small / unknown inputs, COPY (merge the dict
1943                // into the live chain/tree) for large known inputs. ATTACH keeps
1944                // the dict in history but out of the live structure via
1945                // `skip_matching_dict_bt` (the cursor advance is shared by both
1946                // arms); COPY routes through the normal `skip_matching` (its
1947                // `uses_bt` branch fills the live tree, the lazy branch the live
1948                // chain). The dms is built-or-dropped to match in
1949                // `prime_with_dictionary`.
1950                if self.hc_dict_attach_mode() {
1951                    self.hc_matcher_mut().table.skip_matching_dict_bt();
1952                } else {
1953                    self.hc_matcher_mut().skip_matching(Some(false));
1954                }
1955            }
1956        }
1957    }
1958}
1959
1960impl Matcher for MatchGeneratorDriver {
1961    fn supports_dictionary_priming(&self) -> bool {
1962        true
1963    }
1964
1965    fn set_source_size_hint(&mut self, size: u64) {
1966        self.source_size_hint = Some(size);
1967    }
1968
1969    fn set_dictionary_size_hint(&mut self, size: usize) {
1970        self.dictionary_size_hint = Some(size);
1971    }
1972
1973    /// Heap bytes this driver owns: the active backend's tables/history, the
1974    /// recycled input-buffer pool, and the primed-dictionary snapshot (a cloned
1975    /// backend kept for CDict-equivalent reuse). The inline struct itself is
1976    /// accounted by the owner's `size_of`.
1977    fn heap_size(&self) -> usize {
1978        let pool: usize = self.vec_pool.capacity() * core::mem::size_of::<Vec<u8>>()
1979            + self.vec_pool.iter().map(Vec::capacity).sum::<usize>();
1980        let snapshot = self
1981            .primed
1982            .as_ref()
1983            .map_or(0, |(storage, _, _)| storage.heap_size());
1984        pool + self.storage.heap_size() + snapshot
1985    }
1986
1987    fn clear_param_overrides(&mut self) {
1988        self.param_overrides = None;
1989    }
1990
1991    fn reset(&mut self, level: CompressionLevel) {
1992        let hint = self.source_size_hint.take();
1993        let dict_hint = self.dictionary_size_hint.take();
1994        // Snapshot the hint's normalized ceil-log bucket for the primed-snapshot
1995        // key and prime_with_dictionary's attach/copy mode decision (the hint is
1996        // consumed here, but priming happens just after reset). Storing the
1997        // bucket rather than the raw bytes means two hints that resolve to the
1998        // same matcher shape share one snapshot instead of each re-priming.
1999        self.reset_size_log = hint.map(source_size_ceil_log);
2000        let hinted = hint.is_some();
2001        #[cfg_attr(not(test), allow(unused_mut))]
2002        let mut params = Self::level_params(level, hint);
2003        // Test-only: apply a parse×search override so the matrix can be
2004        // exercised without editing `LEVEL_TABLE`. Mutating `params` here
2005        // (before `next_backend`) flows the override through storage
2006        // selection, `configure`, and the `self.search`/`self.parse`
2007        // writes uniformly. Consumed with `take()` so it is one-shot: the
2008        // synthetic pairing applies to exactly this `reset()`, and a later
2009        // reset on the same driver falls back to the level's real config.
2010        #[cfg(test)]
2011        if let Some((search, parse)) = self.config_override.take() {
2012            params.search = search;
2013            params.lazy_depth = parse.lazy_depth();
2014            // The matrix sweep can pair a level with a backend its native
2015            // row doesn't populate (e.g. greedy L5, which carries only `row`,
2016            // run on HashChain). Synthesize a default config for the
2017            // overridden backend so its `configure` arm has something to read.
2018            use super::strategy::SearchMethod;
2019            match search {
2020                SearchMethod::Fast => {
2021                    params.fast.get_or_insert(FAST_L1);
2022                }
2023                SearchMethod::DoubleFast => {
2024                    params.dfast.get_or_insert(DFAST_L3);
2025                }
2026                SearchMethod::RowHash => {
2027                    params.row.get_or_insert(ROW_CONFIG);
2028                }
2029                SearchMethod::HashChain | SearchMethod::BinaryTree => {
2030                    params.hc.get_or_insert(HC_CONFIG);
2031                }
2032            }
2033        }
2034        // Public-parameter overrides (#27): apply the per-knob set on top
2035        // of the level-resolved params. A strategy override re-routes the
2036        // backend, so this must precede `next_backend` selection. The
2037        // all-`None` case is skipped so default level geometry stays
2038        // byte-identical to plain level-based compression.
2039        if let Some(ov) = self.param_overrides
2040            && !ov.is_empty()
2041        {
2042            apply_param_overrides(&mut params, &ov);
2043            // `Self::level_params(level, hint)` applied the source-size cap
2044            // for the LEVEL's native backend. If a strategy override moved
2045            // the frame onto a different backend, `apply_param_overrides`
2046            // synthesized that backend's DEFAULT config (FAST_L1 /
2047            // HC_OVERRIDE_DEFAULT) with full-size table logs AFTER that cap
2048            // ran. Re-apply the hint cap so a tiny hinted frame doesn't
2049            // allocate the new backend's full-size tables. An explicit
2050            // `window_log` override is the user's hard request and must
2051            // survive the re-cap, so restore it afterwards.
2052            if let Some(hint_size) = hint {
2053                params = adjust_params_for_source_size(params, hint_size);
2054                if let Some(window_log) = ov.window_log {
2055                    params.window_log = window_log;
2056                }
2057            }
2058        }
2059        // Dictionary-driven table sizing — parity with upstream zstd `ZSTD_createCDict`
2060        // (`ZSTD_getCParams_internal(level, UNKNOWN, dictSize, ZSTD_cpm_createCDict)`
2061        // → `ZSTD_adjustCParams_internal`). A loaded dictionary supplies the
2062        // long-distance matches, so upstream zstd sizes the prepared match-finder tables
2063        // to the DICTIONARY (assuming a `minSrcSize` source), not the live
2064        // window: it downsizes `hashLog`/`chainLog` toward the dict-and-window
2065        // log while leaving the frame's eviction `window_log` source-derived so
2066        // the dictionary bytes stay referenceable (`ZSTD_resetCCtx_byCopyingCDict`
2067        // copies the small CDict tables but keeps the source window). We apply
2068        // the same downsizing to the level's own hc geometry and cap (min) so a
2069        // dict never inflates the level tables. Only the binary-tree / hash-chain
2070        // backend reads `hc.{hash,chain}_log`; Simple/Dfast/Row derive their
2071        // widths from the source window in their `reset` arms.
2072        // A zero-length dictionary is "no dictionary": running the CDict sizing
2073        // path for `Some(0)` is not a no-op — `cdict_table_logs(.., 0)` still
2074        // collapses the HC/BT tables toward the 513-byte upstream zstd tier via
2075        // `DICT_MIN_SRC_SIZE`, tanking ratio/perf on the next frame. Priming
2076        // already treats empty content as empty, so skip the downsizing here too.
2077        if let Some(dict_size) = dict_hint.filter(|&size| size > 0) {
2078            // Derive the dict-tier geometry from the level's FULL (un-source-capped)
2079            // hc widths. `Self::level_params(level, hint)` already source-capped
2080            // `params.hc`; feeding those capped widths into `cdict_table_logs` and
2081            // then `.min()`-ing would double-cap, so on a small hinted source with a
2082            // large dictionary the prepared tables collapse below what the dict needs
2083            // — defeating the `ZSTD_createCDict` geometry this mirrors. Take the
2084            // un-hinted base widths instead and assign the result directly:
2085            // `cdict_table_logs` only ever downsizes, so it never exceeds the base
2086            // level geometry, while the eviction `window_log` stays source-derived so
2087            // the dictionary bytes remain referenceable. Active public-parameter
2088            // overrides (#27) are applied to the base too, so a strategy override
2089            // that routes onto HashChain/BinaryTree still gets dict-tier sizing and
2090            // explicit hash/chain overrides feed through as the geometry ceiling.
2091            let mut base_params = Self::level_params(level, None);
2092            if let Some(ov) = self.param_overrides
2093                && !ov.is_empty()
2094            {
2095                apply_param_overrides(&mut base_params, &ov);
2096            }
2097            if let (Some(hc), Some(base_hc)) = (params.hc.as_mut(), base_params.hc) {
2098                let uses_bt = matches!(
2099                    params.strategy_tag,
2100                    super::strategy::StrategyTag::Btlazy2
2101                        | super::strategy::StrategyTag::BtOpt
2102                        | super::strategy::StrategyTag::BtUltra
2103                        | super::strategy::StrategyTag::BtUltra2
2104                );
2105                let (dict_hash_log, dict_chain_log) = cdict_table_logs(
2106                    params.window_log,
2107                    base_hc.hash_log,
2108                    base_hc.chain_log,
2109                    uses_bt,
2110                    dict_size,
2111                );
2112                hc.hash_log = dict_hash_log;
2113                hc.chain_log = dict_chain_log;
2114            }
2115        }
2116        // upstream zstd `ZSTD_resolveRowMatchFinderMode` (zstd_compress.c:238-245):
2117        // the row matchfinder is used for greedy/lazy/lazy2 ONLY when
2118        // `windowLog > 14`; at or below that upstream runs the hash-chain
2119        // matcher (`ZSTD_HcFindBestMatch`). We previously hardcoded the Row
2120        // backend for these strategies regardless of window, sending every
2121        // small-window frame (hinted floor = windowLog 14, e.g. the small-4k/10k
2122        // fixtures) through Row where upstream uses HC. Match it: fall back to
2123        // the hash-chain matcher (lazy/greedy parse via `lazy_depth`) when the
2124        // resolved window is <= 14. The HC config is synthesised from the
2125        // level's RowConfig (HC and Row share the same cParams; only the
2126        // matchfinder differs) — `hash_log` / `chain_log` are
2127        // clamped to the (<= 14) window inside the HashChain reset arm, so the
2128        // nominal width here only sets the clamp ceiling.
2129        if params.search == super::strategy::SearchMethod::RowHash && params.window_log <= 14 {
2130            let row = params
2131                .row
2132                .expect("a RowHash level row must carry a RowConfig");
2133            params.search = super::strategy::SearchMethod::HashChain;
2134            // For a dict-bearing frame, downsize the synthesised HC logs to the
2135            // dictionary's content tier via `cdict_table_logs` (the same
2136            // correction the native HC dict-prime path applies above), so a dict
2137            // much smaller than the window doesn't prime a needlessly sparse
2138            // table. Row-finder levels are never BinaryTree, so `uses_bt = false`.
2139            //
2140            // Feed `cdict_table_logs` the UN-hinted base Row width, not the
2141            // resolved `row.hash_bits`: the latter is already source-capped on a
2142            // hinted reset (the `row_cap = table_log + 1` clamp), so passing it
2143            // here would double-cap exactly as the native HC dict path warns
2144            // above — a small hinted source with a large dictionary would
2145            // collapse the prepared table below what the dict needs.
2146            // `cdict_table_logs` only ever downsizes, so deriving the ceiling
2147            // from the un-hinted base (plus active public overrides) keeps the
2148            // dict-tier geometry intact. No source hint => `row.hash_bits` is
2149            // already the level's full width, so reuse it directly.
2150            let row_cdict_hash_bits = match dict_hint.filter(|&size| size > 0) {
2151                Some(_) => {
2152                    let mut base_params = Self::level_params(level, None);
2153                    if let Some(ov) = self.param_overrides
2154                        && !ov.is_empty()
2155                    {
2156                        apply_param_overrides(&mut base_params, &ov);
2157                    }
2158                    base_params
2159                        .row
2160                        .map_or(row.hash_bits, |base_row| base_row.hash_bits)
2161                }
2162                None => row.hash_bits,
2163            };
2164            // Row-backed levels carry only `hash_bits`; the HC chain table they
2165            // fall back to follows the upstream zstd cParams relationship `chainLog =
2166            // hashLog - 1` for every Row level (L6 c18 h19 .. L12 c22 h23, see
2167            // the ROW_L* tables). Synthesise the chain width as `hash_bits - 1`
2168            // so the dict path doesn't leave the chain table one bit too wide
2169            // (cdict_table_logs only downsizes, so passing the full hash width
2170            // for both would keep a 2x-too-large chain table on dict frames).
2171            // Raw `- 1` is underflow-safe: `hash_bits` is either a predefined
2172            // ROW_L* width (>= 19) or a public `hash_log` override, and the
2173            // override is range-validated to `ZSTD_HASHLOG_MIN = 6` at the
2174            // parameter API, so the value is always >= 6 here.
2175            //
2176            // A public `chain_log` override (#27) is dropped by the RowHash
2177            // override arm (Row has no chain table), but once this frame falls
2178            // back to HC the chain table is live and must honour it — mirror
2179            // the native HC dict path, which feeds the override-applied
2180            // `base_hc.chain_log` into `cdict_table_logs`. Use the explicit
2181            // override (also API-validated to ZSTD_CHAINLOG_MIN = 6) when set,
2182            // else the upstream zstd `hashLog - 1` relationship.
2183            let explicit_chain_log = self
2184                .param_overrides
2185                .filter(|ov| !ov.is_empty())
2186                .and_then(|ov| ov.chain_log)
2187                .map(|chain_log| chain_log as usize);
2188            let row_cdict_chain_bits = explicit_chain_log.unwrap_or(row_cdict_hash_bits - 1);
2189            let (mut hash_log, mut chain_log) = match dict_hint.filter(|&size| size > 0) {
2190                Some(dict_size) => cdict_table_logs(
2191                    params.window_log,
2192                    row_cdict_hash_bits,
2193                    row_cdict_chain_bits,
2194                    false,
2195                    dict_size,
2196                ),
2197                None => (
2198                    row.hash_bits,
2199                    explicit_chain_log.unwrap_or(row.hash_bits - 1),
2200                ),
2201            };
2202            // No-dict path: the HashChain reset arm only clamps the logs to the
2203            // window when `hinted`, but a public `window_log` override can lower
2204            // this level to <= 14 with no source hint — clamp the level's full
2205            // Row `hash_bits` to the window here too (upstream zstd `ZSTD_adjustCParams`:
2206            // hashLog <= windowLog + 1, chainLog <= windowLog) so a 16 KiB window
2207            // doesn't allocate Row-sized HC tables.
2208            if dict_hint.filter(|&size| size > 0).is_none() {
2209                let wlog = params.window_log as usize;
2210                hash_log = hash_log.min(wlog + 1);
2211                chain_log = chain_log.min(wlog);
2212            }
2213            params.hc = Some(HcConfig {
2214                hash_log,
2215                chain_log,
2216                search_depth: row.search_depth,
2217                target_len: row.target_len,
2218                search_mls: 4,
2219            });
2220            params.row = None;
2221        }
2222        let next_backend = params.backend();
2223        let max_window_size = 1usize << params.window_log;
2224        self.dictionary_retained_budget = 0;
2225        // Drop any frame-local borrowed staging so it can't leak across a
2226        // reset and misroute the next start/skip into borrowed dispatch.
2227        self.borrowed_pending = None;
2228        if self.active_backend() != next_backend {
2229            // Drain the outgoing backend's allocations into the shared
2230            // pool. The `match &mut self.storage { ... }` block runs to
2231            // completion before the assignment below replaces the
2232            // variant, so the inner state we just drained is dropped
2233            // with the old variant.
2234            match &mut self.storage {
2235                MatcherStorage::Simple(_m) => {
2236                    // FastKernelMatcher owns a flat Vec<u8> history
2237                    // and a Vec<u32> hash table — both drop with the
2238                    // variant assignment below, no per-block buffers
2239                    // to recycle into the driver pools. The
2240                    // assignment-replace path collapses to a noop
2241                    // pre-pass for this backend.
2242                }
2243                MatcherStorage::Dfast(m) => {
2244                    // Drop the long / short hash table allocations
2245                    // before calling `m.reset`. Without this prepass,
2246                    // `DfastMatchGenerator::reset` would `fill` both
2247                    // tables with `DFAST_EMPTY_SLOT` sentinels — wasted
2248                    // work given the next assignment to `self.storage`
2249                    // is about to drop `m` entirely. `reset` itself
2250                    // short-circuits on `if !self.short_hash.is_empty()`,
2251                    // so handing it an empty `Vec` skips the fill loop.
2252                    // Mirrors the pre-drain pattern in the HashChain
2253                    // arm below (and serves the same peak-memory
2254                    // purpose: release the table-allocation footprint
2255                    // before constructing the replacement variant).
2256                    m.short_hash = Vec::new();
2257                    m.long_hash = Vec::new();
2258                    m.reset();
2259                }
2260                MatcherStorage::Row(m) => {
2261                    m.row_heads = Vec::new();
2262                    m.row_positions = Vec::new();
2263                    m.row_tags = Vec::new();
2264                    m.reset();
2265                }
2266                MatcherStorage::HashChain(m) => {
2267                    // Release oversized tables when switching away from
2268                    // HashChain so Best's larger allocations don't persist.
2269                    // hash3_table must be released alongside the other
2270                    // two: BtUltra2's `1 << HC3_HASH_LOG` entries would
2271                    // otherwise stay pinned across the backend switch,
2272                    // even though no future caller of this backend will
2273                    // touch them.
2274                    m.table.hash_table = Vec::new();
2275                    m.table.chain_table = Vec::new();
2276                    m.table.hash3_table = Vec::new();
2277                    let vec_pool = &mut self.vec_pool;
2278                    m.reset(|mut data| {
2279                        data.resize(data.capacity(), 0);
2280                        vec_pool.push(data);
2281                    });
2282                }
2283            }
2284            // Swap in a fresh variant for the new backend. The previous
2285            // `storage` is dropped here.
2286            self.storage = match next_backend {
2287                super::strategy::BackendTag::Simple => {
2288                    // Per-level Fast cParams from resolve_level_params:
2289                    // Level(1) gets (hash_log=14, mls=7); Level(-7..=-1)
2290                    // get upstream zstd row-0 (hash_log=13, mls=7); Fastest /
2291                    // Uncompressed keep (hash_log=14, mls=6). See
2292                    // resolve_level_params for rationale.
2293                    let fast = params.fast.expect("Fast level row carries a FastConfig");
2294                    MatcherStorage::Simple(FastKernelMatcher::with_params(
2295                        params.window_log,
2296                        fast.hash_log,
2297                        fast.mls,
2298                        fast.step_size,
2299                    ))
2300                }
2301                super::strategy::BackendTag::Dfast => {
2302                    MatcherStorage::Dfast(DfastMatchGenerator::new(max_window_size))
2303                }
2304                super::strategy::BackendTag::Row => {
2305                    MatcherStorage::Row(RowMatchGenerator::new(max_window_size))
2306                }
2307                super::strategy::BackendTag::HashChain => {
2308                    MatcherStorage::HashChain(HcMatchGenerator::new(max_window_size))
2309                }
2310            };
2311        }
2312
2313        // Single source of truth: `LevelParams::strategy_tag` is the
2314        // authoritative mapping from `CompressionLevel` to strategy.
2315        // `storage.backend()` derives the parse family from the variant,
2316        // so there is no separate runtime tag that could drift against
2317        // `LEVEL_TABLE`.
2318        self.strategy_tag = params.strategy_tag;
2319        self.search = params.search;
2320        self.parse = params.parse();
2321        self.slice_size = self.base_slice_size.min(max_window_size);
2322        self.reported_window_size = max_window_size;
2323        let strategy_tag = self.strategy_tag;
2324        // Source-proportional table window for the backends whose hash-table
2325        // widths are recomputed here (Dfast / Row). Like the HC / Fast caps
2326        // in `adjust_params_for_source_size`, this sizes the internal tables
2327        // from the RAW source log (not the wire `window_log` floor) so a
2328        // small frame zeroes a small table; it never exceeds the real window.
2329        let table_window_size = match hint {
2330            Some(h) => {
2331                let raw_log = source_size_ceil_log(h);
2332                // Clamp the shift below the pointer width before `1usize <<`:
2333                // an oversized hint (>= 2^63 + 1, and on 32-bit usize any hint
2334                // >= 2^32) drives `raw_log` to 64 / >= 32, and the shift would
2335                // overflow (panic in debug, wrap to 0 in release) before the
2336                // `.min(max_window_size)` cap below could bound it. The min cap
2337                // still provides the real semantic window bound.
2338                let shift = raw_log.max(MIN_WINDOW_LOG).min(usize::BITS as u8 - 1);
2339                (1usize << shift).min(max_window_size)
2340            }
2341            None => max_window_size,
2342        };
2343        // The hint-dependent hash-table width the active backend applies, for
2344        // the primed-snapshot key. Dfast/Row compute it from `table_window_size`
2345        // below; HC/Fast leave it `0` because their widths live in `params`
2346        // (`hc.{hash,chain}_log` / `fast_hash_log`) — already part of the key.
2347        let mut resolved_table_bits: usize = 0;
2348        match &mut self.storage {
2349            MatcherStorage::Simple(m) => {
2350                // Per-level Fast cParams threaded from
2351                // resolve_level_params (see Simple-backend swap
2352                // arm above for the (level → params) mapping).
2353                let fast = params.fast.expect("Fast level row carries a FastConfig");
2354                // Same attach/copy split the dict-prime dispatch applies
2355                // below (`prime_with_dictionary`): only attach-mode dict
2356                // frames may keep the main table across the reset via an
2357                // epoch advance — copy-mode and no-dict frames must memset
2358                // it back to bias 0 for the raw-slice kernels.
2359                // `Some(0)` is "no dictionary" (the dict-sizing path above
2360                // filters it the same way): an empty dict primes nothing, so
2361                // an epoch-advance reset would preserve stale attach state
2362                // instead of clearing it.
2363                let dict_attach_epoch = matches!(dict_hint, Some(size) if size > 0)
2364                    && self
2365                        .reset_size_log
2366                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2367                // Copy-mode dictionary frame whose primed snapshot matches
2368                // this exact resolved shape: `restore_primed_dictionary`
2369                // (called right after this reset; the caller gates the
2370                // restore on the same size bucket and the restore re-checks
2371                // the same key) will `clone_from` the snapshot over this
2372                // matcher, replacing the table contents and bias wholesale —
2373                // the reset's full-table memset would be thrown away. The
2374                // key components mirror `reset_shape` below: Simple leaves
2375                // `resolved_table_bits` 0, never carries an LDM override,
2376                // and `fast_attach` is false in copy mode by construction.
2377                let table_overwritten_by_restore = matches!(dict_hint, Some(size) if size > 0)
2378                    && !dict_attach_epoch
2379                    && self.primed.as_ref().is_some_and(|(_, _, captured)| {
2380                        *captured
2381                            == PrimedKey {
2382                                level,
2383                                params,
2384                                table_bits: 0,
2385                                fast_attach: false,
2386                                ldm: None,
2387                            }
2388                    });
2389                m.reset(
2390                    params.window_log,
2391                    fast.hash_log,
2392                    fast.mls,
2393                    fast.step_size,
2394                    dict_attach_epoch,
2395                    table_overwritten_by_restore,
2396                );
2397            }
2398            MatcherStorage::Dfast(dfast) => {
2399                dfast.max_window_size = max_window_size;
2400                let dcfg = params
2401                    .dfast
2402                    .expect("Dfast level row must carry a DfastConfig");
2403                // Upstream zstd `cParams.hashLog`/`chainLog`, capped by the
2404                // source-size window when hinted so tiny inputs don't
2405                // over-allocate.
2406                let long_bits = if hinted {
2407                    dfast_hash_bits_for_window(table_window_size).min(dcfg.long_hash_log as usize)
2408                } else {
2409                    dcfg.long_hash_log as usize
2410                };
2411                let short_bits = if hinted {
2412                    dfast_hash_bits_for_window(table_window_size).min(dcfg.short_hash_log as usize)
2413                } else {
2414                    dcfg.short_hash_log as usize
2415                };
2416                resolved_table_bits = long_bits;
2417                dfast.set_hash_bits(long_bits, short_bits);
2418                // Dfast holds no per-block input Vecs (history owns the
2419                // bytes and `add_data` returns each Vec eagerly), so
2420                // `reset` takes no `reuse_space` callback.
2421                dfast.reset();
2422            }
2423            MatcherStorage::Row(row) => {
2424                row.max_window_size = max_window_size;
2425                row.lazy_depth = params.lazy_depth;
2426                let mut row_cfg = params.row.expect("Row level row carries a RowConfig");
2427                if hinted {
2428                    // Clamp the configured hash width by the hinted window
2429                    // (upstream zstd `ZSTD_adjustCParams` caps hashLog by windowLog) —
2430                    // `min`, not replace, so an explicit `hash_log` param
2431                    // override (`row_cfg.hash_bits`) survives the hinted path
2432                    // instead of being overwritten by the window value.
2433                    //
2434                    // Clamp BEFORE `configure` so the backend sees ONE width
2435                    // per frame. Configuring with the unclamped level width
2436                    // and then re-clamping made `row_hash_log` oscillate on
2437                    // every hinted frame, and each width change clears the
2438                    // row tables — `ensure_tables` then re-filled all three
2439                    // every frame in a reused compressor.
2440                    row_cfg.hash_bits = row_cfg
2441                        .hash_bits
2442                        .min(row_hash_bits_for_window(table_window_size));
2443                }
2444                row.configure(row_cfg);
2445                // Key the primed snapshot on the width the backend ACTUALLY
2446                // applied (`set_hash_bits` clamps the request): recording the
2447                // request — or the 0 default on the unhinted path — keys
2448                // identical table geometries apart and forces needless
2449                // dictionary re-primes.
2450                resolved_table_bits = row.hash_bits();
2451                row.reset();
2452            }
2453            MatcherStorage::HashChain(hc) => {
2454                hc.table.max_window_size = max_window_size;
2455                hc.hc.lazy_depth = params.lazy_depth;
2456                let mut hc_cfg = params.hc.expect("HashChain level row carries an HcConfig");
2457                // Cap the hash / chain table logs by the hinted window so a small
2458                // input doesn't allocate the full level's tables (the upstream zstd
2459                // `ZSTD_adjustCParams_internal` clamp: `hashLog <= windowLog + 1`,
2460                // and `cycleLog <= windowLog` — `cycleLog == chainLog` for the HC
2461                // finder, `chainLog - 1` for the BT pair table, so `chainLog <=
2462                // windowLog` (+1 for BT)). Ratio-neutral: a hinted window of
2463                // `2^wlog` bytes holds at most `2^wlog` positions, so the slots
2464                // beyond that are never populated — capping only sheds unused
2465                // allocation. Was the source of L10-lazy peak-alloc ~2.15x the
2466                // upstream zstd on a 1 MiB input. Only applied when hinted; an
2467                // unknown-size stream keeps the full level tables.
2468                // Skip for dict-bearing frames: their `hc_cfg.{hash,chain}_log`
2469                // were already sized to the dictionary content tier via
2470                // `cdict_table_logs` (the dict supplies the long-distance
2471                // matches, so upstream `ZSTD_createCDict` sizes the prepared
2472                // tables to the dict, not the source window). Re-applying the
2473                // source-window cap here would collapse those dict-tier logs
2474                // back to the small hinted source — the same double-cap the
2475                // synthesis sites avoid by using the un-hinted base width.
2476                if hinted && !matches!(dict_hint, Some(size) if size > 0) {
2477                    let wlog = hc_hash_bits_for_window(table_window_size);
2478                    let uses_bt = matches!(
2479                        strategy_tag,
2480                        super::strategy::StrategyTag::Btlazy2
2481                            | super::strategy::StrategyTag::BtOpt
2482                            | super::strategy::StrategyTag::BtUltra
2483                            | super::strategy::StrategyTag::BtUltra2
2484                    );
2485                    hc_cfg.hash_log = hc_cfg.hash_log.min(wlog + 1);
2486                    hc_cfg.chain_log = hc_cfg.chain_log.min(if uses_bt { wlog + 1 } else { wlog });
2487                }
2488                hc.configure(hc_cfg, strategy_tag, params.window_log);
2489                let vec_pool = &mut self.vec_pool;
2490                hc.reset(|mut data| {
2491                    data.resize(data.capacity(), 0);
2492                    vec_pool.push(data);
2493                });
2494                // When the source size is known, pre-size the history mirror to
2495                // the expected total (dictionary + payload) so per-block growth
2496                // does not overshoot via Vec capacity doubling (upstream zstd sizes its
2497                // window buffer exactly). Dominates peak once the match-finder
2498                // tables are dictionary-tier-small. Unhinted streams skip this
2499                // and keep doubling growth.
2500                if let Some(src) = hint {
2501                    // `src` is a u64 hint and may be the u64::MAX "unknown
2502                    // size" sentinel, which truncates under `as usize` on
2503                    // 32-bit targets and overflows when the dict hint is
2504                    // added. Saturate the source size, then saturate the
2505                    // dict-hint addition; `reserve_history` applies the
2506                    // tighter window ceiling to the result.
2507                    let src_hint = usize::try_from(src).unwrap_or(usize::MAX);
2508                    let expected = src_hint.saturating_add(dict_hint.unwrap_or(0));
2509                    hc.table.reserve_history(expected);
2510                }
2511            }
2512        }
2513        // LDM wiring (#27): attach (or clear) the long-distance-match
2514        // producer on the optimal (BT) backend. LDM is the only
2515        // back-reference path that crosses the regular window, so it
2516        // only has a home on the `BtMatcher`; non-BT strategies drop the
2517        // producer. Built AFTER `hc.reset()` because `BtMatcher::reset`
2518        // clears an existing producer's table but does not null the
2519        // slot — installing here gives the new frame a fresh producer.
2520        #[cfg(feature = "hash")]
2521        if let MatcherStorage::HashChain(hc) = &mut self.storage {
2522            let producer = self
2523                .param_overrides
2524                .as_ref()
2525                .and_then(|ov| ov.ldm)
2526                .map(|ldm_ov| {
2527                    let strategy_ord = ldm_strategy_ordinal(params.strategy_tag, params.lazy_depth);
2528                    // Seed the caller-pinned knobs, then run the upstream zstd
2529                    // derivation over the seed so the remaining (zero)
2530                    // fields are filled with cross-field consistency
2531                    // (e.g. `hash_rate_log = window_log - hash_log`).
2532                    // Clobbering after `adjust_for` would break that and
2533                    // hand the producer an inconsistent set.
2534                    let seed = super::ldm::params::LdmParams {
2535                        window_log: params.window_log as u32,
2536                        hash_log: ldm_ov.hash_log.unwrap_or(0),
2537                        hash_rate_log: ldm_ov.hash_rate_log.unwrap_or(0),
2538                        min_match_length: ldm_ov.min_match.unwrap_or(0),
2539                        bucket_size_log: ldm_ov.bucket_size_log.unwrap_or(0),
2540                    };
2541                    super::ldm::LdmProducer::new(seed.derive(strategy_ord))
2542                });
2543            hc.set_ldm_producer(producer);
2544        }
2545        // Record the resolved matcher shape for the primed-snapshot key. Captured
2546        // here (post-resolution, after the test-only param override) so the key
2547        // reflects exactly the geometry the restored `storage` must match. The
2548        // Fast attach-vs-copy mode is part of the shape ONLY for the Simple
2549        // backend (it decides the distinct dict-table shape that backend builds).
2550        // Dfast/Row/HashChain have their OWN attach/copy regimes, but this bit
2551        // models only the Fast table split; those backends are keyed by the
2552        // resolved matcher geometry instead, so folding the Fast bit into their
2553        // key would over-key identical resolved shapes. When it applies it
2554        // matches the decision `prime_with_dictionary` makes from the same
2555        // `reset_size_log`.
2556        let fast_attach = matches!(next_backend, super::strategy::BackendTag::Simple)
2557            && self
2558                .reset_size_log
2559                .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2560        // The LDM override is part of the snapshot identity ONLY on the
2561        // optimal (BinaryTree) path: that is the only backend whose cloned
2562        // `storage` carries a `BtMatcher::ldm_producer`. On Fast / Dfast /
2563        // Row and lazy-HashChain resets the producer slot does not exist,
2564        // so folding the override there would over-key the snapshot and
2565        // force needless re-primes when LDM is toggled. Gated like
2566        // `fast_attach` (a key bit only participates where it changes the
2567        // cloned matcher shape).
2568        let active_ldm = if matches!(params.search, super::strategy::SearchMethod::BinaryTree) {
2569            self.param_overrides.and_then(|ov| ov.ldm)
2570        } else {
2571            None
2572        };
2573        self.reset_shape = Some((params, resolved_table_bits, fast_attach, active_ldm));
2574    }
2575
2576    fn prime_with_dictionary(&mut self, dict_content: &[u8], offset_hist: [u32; 3]) {
2577        match self.active_backend() {
2578            super::strategy::BackendTag::Simple => {
2579                // Routes through prime_offset_history so BOTH
2580                // offset_hist (wire encoder) and rep[0..2] (kernel)
2581                // are updated atomically. Without this, the two
2582                // tracks drift after dict priming — kernel emits
2583                // repcode matches against stale FAST_INITIAL_REP
2584                // while the wire encoder uses the primed history,
2585                // producing divergent wire encoding (Copilot review
2586                // #15 on #216).
2587                self.simple_mut().prime_offset_history(offset_hist);
2588            }
2589            super::strategy::BackendTag::Dfast => {
2590                self.dfast_matcher_mut().offset_hist = offset_hist
2591            }
2592            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2593            super::strategy::BackendTag::HashChain => {
2594                let matcher = self.hc_matcher_mut();
2595                matcher.table.offset_hist = offset_hist;
2596                matcher.table.mark_dictionary_primed();
2597            }
2598        }
2599
2600        if dict_content.is_empty() {
2601            return;
2602        }
2603
2604        // Dictionary bytes should stay addressable until produced frame output
2605        // itself exceeds the live window size. We bump `max_window_size`
2606        // by the dictionary length so the eviction band keeps the
2607        // primed bytes in `history`.
2608        //
2609        // Cap: `with_params`/`reset` enforce `window_log <= 30` so the
2610        // eviction band `2 * max_window_size` stays below `u32::MAX`
2611        // with headroom for one MAX_BLOCK_SIZE pending block — the
2612        // kernel asserts `data.len() <= u32::MAX`. A large enough
2613        // dictionary could otherwise push `max_window_size` past
2614        // that ceiling via the `saturating_add` below and silently
2615        // re-introduce the same overflow the `window_log` cap was
2616        // designed to prevent. Clamp the post-priming size so the
2617        // doubled-band-plus-block invariant survives.
2618        const MAX_PRIMED_WINDOW_SIZE: usize =
2619            (u32::MAX as usize - crate::common::MAX_BLOCK_SIZE as usize) / 2;
2620
2621        // `requested_dict_budget` is what the caller asked for;
2622        // `base_max_window_size` snapshots the pre-priming cap so we
2623        // can compute how much window the cap actually GRANTED below.
2624        // The cap may clip the requested growth, in which case the
2625        // bookkeeping (`dictionary_retained_budget` retire path) must
2626        // track only the granted portion — otherwise
2627        // `retire_dictionary_budget()` would later reclaim more than
2628        // was actually added and shrink the matcher below its real
2629        // base window (and `cap = 2 * max_window_size` would shrink
2630        // with it, risking under-allocation on subsequent commits).
2631        // The `granted_retained_budget` calculation further below is
2632        // the load-bearing piece — see its block-level comment for
2633        // the post-clip / post-uncommitted-tail math.
2634        let requested_dict_budget = dict_content.len();
2635        let base_max_window_size = match self.active_backend() {
2636            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2637            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2638            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2639            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2640        };
2641        match self.active_backend() {
2642            super::strategy::BackendTag::Simple => {
2643                let matcher = self.simple_mut();
2644                matcher.max_window_size = matcher
2645                    .max_window_size
2646                    .saturating_add(requested_dict_budget)
2647                    .min(MAX_PRIMED_WINDOW_SIZE);
2648            }
2649            super::strategy::BackendTag::Dfast => {
2650                let matcher = self.dfast_matcher_mut();
2651                matcher.max_window_size = matcher
2652                    .max_window_size
2653                    .saturating_add(requested_dict_budget)
2654                    .min(MAX_PRIMED_WINDOW_SIZE);
2655            }
2656            super::strategy::BackendTag::Row => {
2657                let matcher = self.row_matcher_mut();
2658                matcher.max_window_size = matcher
2659                    .max_window_size
2660                    .saturating_add(requested_dict_budget)
2661                    .min(MAX_PRIMED_WINDOW_SIZE);
2662            }
2663            super::strategy::BackendTag::HashChain => {
2664                let matcher = self.hc_matcher_mut();
2665                matcher.table.max_window_size = matcher
2666                    .table
2667                    .max_window_size
2668                    .saturating_add(requested_dict_budget)
2669                    .min(MAX_PRIMED_WINDOW_SIZE);
2670            }
2671        }
2672
2673        let mut start = 0usize;
2674        let mut committed_dict_budget = 0usize;
2675        // insert_position needs 4 bytes of lookahead for hashing;
2676        // backfill_boundary_positions re-visits tail positions once the
2677        // next slice extends history, but cannot hash <4 byte fragments.
2678        let min_primed_tail = match self.active_backend() {
2679            super::strategy::BackendTag::Simple => MIN_MATCH_LEN,
2680            super::strategy::BackendTag::Dfast
2681            | super::strategy::BackendTag::Row
2682            | super::strategy::BackendTag::HashChain => 4,
2683        };
2684        while start < dict_content.len() {
2685            let end = (start + self.slice_size).min(dict_content.len());
2686            if end - start < min_primed_tail {
2687                break;
2688            }
2689            // Stage the dict chunk WITHOUT `get_next_space`'s
2690            // `resize(slice_size, 0)` zero-fill: that memsets a full
2691            // block-sized buffer (up to ~128 KiB) every frame only to have it
2692            // `clear()`-ed and overwritten by the dict bytes on the very next
2693            // lines — pure waste (measured ~10% of the small dict encode).
2694            // Reuse a pooled buffer's capacity if one is free (the prime/skip
2695            // cycle recycles them back), else allocate exactly the chunk.
2696            // Mirrors upstream zstd, which references the CDict content rather
2697            // than zero-filling a fresh window per frame.
2698            let mut space = self.vec_pool.pop().unwrap_or_default();
2699            space.clear();
2700            space.extend_from_slice(&dict_content[start..end]);
2701            self.commit_space(space);
2702            self.skip_matching_for_dictionary_priming();
2703            committed_dict_budget += end - start;
2704            start = end;
2705        }
2706
2707        // Derive `granted_retained_budget` directly from the two real
2708        // bounds — bytes actually committed and bytes the cap allows
2709        // — instead of doing a cap-clip pass followed by an
2710        // uncommitted-tail subtract. Previous shape double-discounted
2711        // when the cap clipped: clip lost `(requested - allowed)`,
2712        // then tail-subtract lost ANOTHER `(requested - committed)`,
2713        // leaving `max_window_size` shy of the dictionary that was
2714        // actually retained (e.g. cap=900, committed=998, uncommitted=2
2715        // landed at granted=898 instead of the correct 900).
2716        let capped_retained_budget = MAX_PRIMED_WINDOW_SIZE.saturating_sub(base_max_window_size);
2717        let granted_retained_budget = committed_dict_budget.min(capped_retained_budget);
2718        let final_max_window_size = base_max_window_size.saturating_add(granted_retained_budget);
2719        match self.active_backend() {
2720            super::strategy::BackendTag::Simple => {
2721                self.simple_mut().max_window_size = final_max_window_size;
2722            }
2723            super::strategy::BackendTag::Dfast => {
2724                self.dfast_matcher_mut().max_window_size = final_max_window_size;
2725            }
2726            super::strategy::BackendTag::Row => {
2727                self.row_matcher_mut().max_window_size = final_max_window_size;
2728            }
2729            super::strategy::BackendTag::HashChain => {
2730                self.hc_matcher_mut().table.max_window_size = final_max_window_size;
2731            }
2732        }
2733        if granted_retained_budget > 0 {
2734            self.dictionary_retained_budget = self
2735                .dictionary_retained_budget
2736                .saturating_add(granted_retained_budget);
2737        }
2738        if self.active_backend() == super::strategy::BackendTag::HashChain {
2739            // Recompute the lazy-HC attach decision made per-chunk in
2740            // `skip_matching_for_dictionary_priming` (stable across the prime —
2741            // `reset_size_log` does not change here).
2742            //
2743            // The HC attach/copy mode is deliberately NOT folded into `PrimedKey`
2744            // (unlike Fast `fast_attach`). Fast attach builds a separate dict
2745            // table whose dimensions differ from the copy-mode live table, so a
2746            // cross-mode restore would install mismatched table geometry and the
2747            // encoder could search past the frame window (undecodable). The two
2748            // HC modes share identical window geometry: `max_window_size` and the
2749            // dictionary limit are both set ABOVE this branch (the same value in
2750            // either mode), and the live chain table dimensions come from the
2751            // resolved `params` the key already pins. The modes differ only in
2752            // WHERE the committed dict lives — a single-link `dms` (attach) vs
2753            // merged into the live chain (copy) — both producing valid matches at
2754            // in-window offsets. Upstream zstd makes the same observation: attach
2755            // (`ZSTD_resetCCtx_byAttachingCDict`) and copy
2756            // (`ZSTD_resetCCtx_byCopyingCDict`) both keep the caller's
2757            // `windowLog`; the choice is a memory/speed trade-off, not a wire
2758            // contract. So restoring an attach snapshot where this frame would
2759            // have copied (or vice versa) yields a decodable frame that may only
2760            // differ in which matches are found (ratio) — algorithmic freedom, not
2761            // a defect. Keying on the mode would instead force a re-prime across
2762            // the cutoff, re-adding the per-frame cost this snapshot path removes.
2763            //
2764            // In practice the public reuse path (`compress_independent_frame`)
2765            // only ever captures AND restores the COPY-mode snapshot — capture is
2766            // gated on the above-cutoff source size, so a restored frame always
2767            // matches the captured mode. `hc_dict_snapshot_reuse_roundtrips` pins
2768            // that same-mode reuse decodes; the driver-level cross-mode restore is
2769            // accepted (not refused) per
2770            // `primed_snapshot_fast_attach_does_not_over_key_non_simple_backends`.
2771            let attach = self.hc_dict_attach_mode();
2772            let table = &mut self.hc_matcher_mut().table;
2773            table.set_dictionary_limit_from_primed_bytes(committed_dict_budget);
2774            // Build the dictMatchState over the committed dict (front of history)
2775            // so `find_best_match` dual-probes it with its own compare budget —
2776            // but ONLY in ATTACH mode. BT/optimal attach → DUBT dms; lazy-HC
2777            // attach → single-link hash-chain dms. COPY mode (large known source,
2778            // both BT and lazy-HC) already merged the dict into the live tree /
2779            // chain in `skip_matching_for_dictionary_priming`, so it carries no
2780            // separate dms — drop any stale one.
2781            if !attach {
2782                table.dms.invalidate();
2783            } else if table.uses_bt {
2784                table.prime_dms_bt(committed_dict_budget);
2785            } else {
2786                table.prime_dms_hc(committed_dict_budget);
2787            }
2788        }
2789        // CDict-equivalent: now that every dict chunk is indexed, mark the
2790        // Fast-backend dict table primed so the next frame's re-prime reuses
2791        // it (skips the re-hash) while still re-committing the dict bytes to
2792        // history. No-op when the attach path built no table (copy mode or a
2793        // sub-8-byte dict) — `mark_dict_primed` self-guards on table presence.
2794        match self.active_backend() {
2795            super::strategy::BackendTag::Simple => self.simple_mut().mark_dict_primed(),
2796            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().mark_dict_primed(),
2797            super::strategy::BackendTag::Row => self.row_matcher_mut().mark_dict_primed(),
2798            _ => {}
2799        }
2800    }
2801
2802    fn restore_primed_dictionary(&mut self, level: super::CompressionLevel) -> bool {
2803        // Only the (storage, dictionary_retained_budget) pair is what
2804        // `prime_with_dictionary` writes; restoring them reproduces the
2805        // post-prime state exactly. Gated on the FULL resolved key (level + the
2806        // resolved `LevelParams` + the active backend's table width), not just
2807        // the level: `reset` resolves the hint into a window/table geometry, so a
2808        // same-level snapshot taken at a hint that resolved to a different shape
2809        // carries a `storage.max_window_size` / table dimensions that no longer
2810        // match this reset. Restoring it would let the encoder search past the
2811        // frame header's window (an undecodable match), so on a key mismatch we
2812        // refuse and the caller re-primes.
2813        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
2814            return false;
2815        };
2816        let key = PrimedKey {
2817            level,
2818            params,
2819            table_bits,
2820            fast_attach,
2821            ldm,
2822        };
2823        let Some((snapshot, budget, captured_key)) = &self.primed else {
2824            return false;
2825        };
2826        if *captured_key != key {
2827            return false;
2828        }
2829        let budget = *budget;
2830        match (&mut self.storage, snapshot) {
2831            // Same-variant Fast restore: copy the snapshot into the retained
2832            // live storage. `clone_from` reuses the history / hash-table /
2833            // dict-table buffers, so this is the upstream zstd CDict table-copy
2834            // regime's cost (pure copies) instead of a full per-frame
2835            // allocation + copy + drop cycle.
2836            (MatcherStorage::Simple(live), MatcherStorage::Simple(snap)) => {
2837                live.clone_from(snap);
2838            }
2839            // Same-variant HC lazy/greedy restore (non-BT): the snapshot keeps
2840            // the full primed hash/chain tables (capture's non-BT full clone),
2841            // so `clone_from` reuses the live history/hash/chain/dms buffers in
2842            // place — upstream zstd reuses the CDict tables rather than reallocating
2843            // them. This is the per-frame allocate+copy+drop that dominated
2844            // small `compress-dict` HC frames (5-7x vs C). BT (`uses_bt`)
2845            // snapshots drop their live tables, so they stay on the realloc
2846            // path below.
2847            (MatcherStorage::HashChain(live), MatcherStorage::HashChain(snap))
2848                if !snap.table.uses_bt =>
2849            {
2850                live.table.clone_from(&snap.table);
2851                live.hc.clone_from(&snap.hc);
2852                live.strategy_tag = snap.strategy_tag;
2853                // backend is `HcBackend::Hc` (zero-sized) for non-BT levels;
2854                // the live one is already correct for this resolved key.
2855            }
2856            (live, snapshot_storage) => {
2857                let mut storage = snapshot_storage.clone();
2858                // This arm handles the binary-tree backend. In ATTACH mode the
2859                // snapshot was stored WITHOUT its live hash / chain / hash3
2860                // tables (they hold no dictionary entries — the dict lives in
2861                // `dms` + history; see `capture_primed_dictionary`), so
2862                // `ensure_tables` re-allocates them zeroed to the snapshot's
2863                // geometry, exactly reproducing the post-prime state (all
2864                // `HC_EMPTY`). In COPY mode the snapshot retained its FULL live
2865                // tree (the dict was merged into it, no `dms`), so the tables are
2866                // already present at the right length and `ensure_tables` — which
2867                // only allocates on a length mismatch — leaves them untouched.
2868                // Either way this is a full storage replace, so no stale
2869                // live-table entry from a prior frame can survive.
2870                if let MatcherStorage::HashChain(hc) = &mut storage {
2871                    hc.table.ensure_tables();
2872                }
2873                // The snapshot does not retain the LDM producer (it holds no
2874                // dict state; see `capture_primed_dictionary`). Carry over the
2875                // frame's freshly-reset producer — built this frame by `reset`
2876                // with the same params the snapshot key pins, and empty (no
2877                // input processed yet), so it is equivalent to the producer
2878                // the snapshot was captured with.
2879                #[cfg(feature = "hash")]
2880                {
2881                    let fresh_ldm = if let MatcherStorage::HashChain(hc) = live {
2882                        hc.take_ldm_producer()
2883                    } else {
2884                        None
2885                    };
2886                    if let MatcherStorage::HashChain(hc) = &mut storage {
2887                        hc.set_ldm_producer(fresh_ldm);
2888                    }
2889                }
2890                *live = storage;
2891            }
2892        }
2893        self.dictionary_retained_budget = budget;
2894        true
2895    }
2896
2897    fn capture_primed_dictionary(&mut self, level: super::CompressionLevel) {
2898        // No resolved shape means `reset` has not run for this frame — nothing
2899        // valid to key a snapshot on, so skip the capture.
2900        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
2901            return;
2902        };
2903        let key = PrimedKey {
2904            level,
2905            params,
2906            table_bits,
2907            fast_attach,
2908            ldm,
2909        };
2910        // CDict-equivalent retained state. A binary-tree level in ATTACH mode
2911        // decouples the dictionary into `dms` (the upstream zstd `dictMatchState`); its
2912        // live hash / chain / hash3 tables carry NO dict entries
2913        // (`skip_matching_dict_bt` keeps the dict out of the live tree), so they
2914        // are pure zeros. Storing them in the snapshot wastes the full table
2915        // footprint (a second window-tier table set resident for the whole
2916        // compress). Instead, move the live tables OUT of the working storage,
2917        // clone only the dict-state (history + `dms` + window/offset/dict-limit),
2918        // then move the live tables back — the snapshot keeps just what upstream zstd's
2919        // CDict keeps, and `restore_primed_dictionary` re-allocates the zeroed
2920        // live tables. Every other case keeps the dict reachable through the live
2921        // structure, so the snapshot must retain the full tables (full clone):
2922        // lazy-HC attach (it DOES prime a hash-chain `dms`, but the live chain is
2923        // still the search structure, so the tables must travel) and COPY mode for
2924        // BOTH BT and lazy-HC (`dms` invalidated, dict merged into the live tree /
2925        // chain). `uses_bt && dms.is_primed()` is therefore the exact "decoupled"
2926        // signal — true only for the BT attach prime; lazy-HC attach primes `dms`
2927        // too but is intentionally NOT decoupled.
2928        let bt_decoupled = matches!(
2929            &self.storage,
2930            MatcherStorage::HashChain(hc) if hc.table.uses_bt && hc.table.dms.is_primed()
2931        );
2932        if bt_decoupled {
2933            let MatcherStorage::HashChain(hc) = &mut self.storage else {
2934                unreachable!("bt_decoupled implies HashChain storage");
2935            };
2936            let hash_table = core::mem::take(&mut hc.table.hash_table);
2937            let chain_table = core::mem::take(&mut hc.table.chain_table);
2938            let hash3_table = core::mem::take(&mut hc.table.hash3_table);
2939            // The LDM producer carries no dictionary state (LDM is not
2940            // dict-primed; its hash table is empty at capture), so it is not
2941            // retained either — `restore` reinstates the frame's freshly-reset
2942            // producer. Take it out so the clone does not duplicate its table.
2943            #[cfg(feature = "hash")]
2944            let ldm_producer = hc.take_ldm_producer();
2945            // Clone the dict-state-only storage (live tables now empty Vecs,
2946            // LDM producer detached).
2947            let snapshot = self.storage.clone();
2948            // Move the live tables (and LDM producer) back into the working storage.
2949            let MatcherStorage::HashChain(hc) = &mut self.storage else {
2950                unreachable!("storage variant is stable across the take/put");
2951            };
2952            hc.table.hash_table = hash_table;
2953            hc.table.chain_table = chain_table;
2954            hc.table.hash3_table = hash3_table;
2955            #[cfg(feature = "hash")]
2956            hc.set_ldm_producer(ldm_producer);
2957            self.primed = Some((snapshot, self.dictionary_retained_budget, key));
2958        } else {
2959            self.primed = Some((self.storage.clone(), self.dictionary_retained_budget, key));
2960        }
2961    }
2962
2963    fn invalidate_primed_dictionary(&mut self) {
2964        self.primed = None;
2965        // Drop the Fast-backend CDict-equivalent table cache too: it is keyed
2966        // to the dictionary being removed / replaced. Left in place, the next
2967        // same-params `reset` would retain it and the kernel would probe a
2968        // dict region whose bytes are no longer re-committed to history.
2969        match self.active_backend() {
2970            super::strategy::BackendTag::Simple => self.simple_mut().invalidate_dict_cache(),
2971            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().invalidate_dict_cache(),
2972            // Row keeps its attach index across frames (like Simple/Dfast),
2973            // so a dictionary swap must drop its cached dict rows too;
2974            // otherwise the next small/unknown-size frame reuses stale
2975            // attach state through `prime_dict_attach_current_block`.
2976            super::strategy::BackendTag::Row => self.row_matcher_mut().invalidate_dict_cache(),
2977            // The BT dms tree is keyed to the dict bytes; `prime_dms_bt`
2978            // skips the rebuild while its shape matches, so a swapped
2979            // dictionary of the same length would otherwise keep serving the
2980            // OLD dictionary's tree.
2981            super::strategy::BackendTag::HashChain => {
2982                self.hc_matcher_mut().table.dms.invalidate();
2983            }
2984        }
2985    }
2986
2987    fn seed_dictionary_entropy(
2988        &mut self,
2989        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
2990        ll: Option<&crate::fse::fse_encoder::FSETable>,
2991        ml: Option<&crate::fse::fse_encoder::FSETable>,
2992        of: Option<&crate::fse::fse_encoder::FSETable>,
2993    ) {
2994        if self.active_backend() == super::strategy::BackendTag::HashChain {
2995            self.hc_matcher_mut()
2996                .seed_dictionary_entropy(huff, ll, ml, of);
2997        }
2998    }
2999
3000    fn window_size(&self) -> u64 {
3001        self.reported_window_size as u64
3002    }
3003
3004    fn get_next_space(&mut self) -> Vec<u8> {
3005        if let Some(mut space) = self.vec_pool.pop() {
3006            if space.len() > self.slice_size {
3007                space.truncate(self.slice_size);
3008            }
3009            if space.len() < self.slice_size {
3010                space.resize(self.slice_size, 0);
3011            }
3012            return space;
3013        }
3014        alloc::vec![0; self.slice_size]
3015    }
3016
3017    fn get_last_space(&mut self) -> &[u8] {
3018        match &self.storage {
3019            MatcherStorage::Simple(m) => m.last_committed_space(),
3020            MatcherStorage::Dfast(m) => m.get_last_space(),
3021            MatcherStorage::Row(m) => m.get_last_space(),
3022            MatcherStorage::HashChain(m) => m.table.get_last_space(),
3023        }
3024    }
3025
3026    fn commit_space(&mut self, space: Vec<u8>) {
3027        let mut evicted_bytes = 0usize;
3028        // Split borrows manually so the `add_data` closures can write
3029        // into `vec_pool` while the backend itself holds an exclusive
3030        // borrow via `storage`. (Suffix-store recycling went away
3031        // with the legacy `MatchGenerator`; the FastKernelMatcher
3032        // arm below has no pool interaction.)
3033        let vec_pool = &mut self.vec_pool;
3034        match &mut self.storage {
3035            MatcherStorage::Simple(m) => {
3036                // FastKernelMatcher owns its history as a single
3037                // flat Vec<u8> and the hash table as a Vec<u32> —
3038                // neither recycles into the driver-side pools. The
3039                // eager pre-commit eviction inside
3040                // `FastKernelMatcher::accept_data` drops bytes when
3041                // accepting this block would push history past 2×
3042                // max_window_size; that delta is what feeds
3043                // `evicted_bytes` here via the `pre / post`
3044                // history-length comparison.
3045                let pre = m.history_len_for_eviction_accounting();
3046                m.accept_data(space);
3047                let post = m.history_len_for_eviction_accounting();
3048                // `accept_data` performs eager pre-commit window
3049                // eviction (so this `pre - post` delta correctly
3050                // feeds the dictionary-budget retire flow). See
3051                // `FastKernelMatcher::accept_data` for the
3052                // commit-time-visibility rationale (closes #216
3053                // CodeRabbit review #5 / Copilot review #1: without
3054                // eager eviction, the delta was always 0 and the
3055                // dict budget never retired, leaving max_window_size
3056                // inflated post-dict-prime → matcher could emit
3057                // offsets exceeding the frame header's window).
3058                evicted_bytes += pre.saturating_sub(post);
3059            }
3060            MatcherStorage::Dfast(m) => {
3061                // Dfast's `add_data` callback receives the INPUT
3062                // `Vec<u8>` for pool recycling (Dfast stores its
3063                // bytes in the contiguous `history` buffer, not in
3064                // per-block Vecs — there is no per-block buffer to
3065                // pop off and hand back). Counting `data.len()` as
3066                // evicted bytes would conflate "new bytes ingested"
3067                // with "old bytes evicted from window"; the two
3068                // happen to coincide when the previous window was
3069                // saturated and the new input fills it 1:1, but
3070                // diverge when the eviction pop-loop drops blocks
3071                // of a different size than the incoming input. The
3072                // `dictionary_retained_budget` retire decision
3073                // downstream then gets driven by inflated eviction
3074                // counts and shrinks `max_window_size` prematurely.
3075                //
3076                // Derive the real eviction delta from `window_size`
3077                // before/after the call. The pop loop inside
3078                // `add_data` decrements `window_size` by each
3079                // evicted block length and then the final
3080                // `extend_from_slice + push_back` adds `space_len`,
3081                // so `evicted = pre + space_len - post`.
3082                let pre = m.window_size;
3083                let space_len = space.len();
3084                m.add_data(space, |data| {
3085                    // Same per-block recycle as the HashChain arm: push
3086                    // the spent input buffer back as-is rather than
3087                    // zero-filling to capacity. `add_data` mirrors the
3088                    // bytes into `history` and calls this every block, so
3089                    // capacity-wide zeroing would be hot-path waste;
3090                    // `get_next_space` zeroes at most `slice_size` bytes
3091                    // when it later reuses the buffer.
3092                    vec_pool.push(data);
3093                });
3094                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3095                // block are byte counts bounded by the window, no overflow.
3096                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3097            }
3098            MatcherStorage::Row(m) => {
3099                // RowMatchGenerator::add_data recycles the *input* buffer
3100                // through this callback every commit (its bytes are mirrored
3101                // into `history`), not the evicted chunks. Derive the eviction
3102                // delta from `window_size` before/after — `evicted = pre +
3103                // space_len - post` — exactly like the Simple / HashChain arms.
3104                // Counting the callback argument as evicted would charge the
3105                // whole committed block as evicted and prematurely retire
3106                // dictionary budget on a window that evicts nothing.
3107                let pre = m.window_size;
3108                let space_len = space.len();
3109                m.add_data(space, |data| {
3110                    // Recycle the spent buffer as-is; `add_data` runs this for
3111                    // every committed block, so zero-filling to capacity here
3112                    // would be hot-path waste (`get_next_space` zeroes at most
3113                    // `slice_size` on reuse).
3114                    vec_pool.push(data);
3115                });
3116                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3117                // block are byte counts bounded by the window, no overflow.
3118                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3119            }
3120            MatcherStorage::HashChain(m) => {
3121                // MatchTable::add_data now recycles the *incoming* buffer
3122                // through `reuse_space` (its bytes are copied into the
3123                // contiguous `history` mirror), so the callback no longer
3124                // reports evicted chunks. Derive the eviction delta from
3125                // `window_size` before/after, exactly like the Simple arm:
3126                // `evicted = pre + space_len - post`.
3127                let pre = m.table.window_size;
3128                let space_len = space.len();
3129                m.table.add_data(space, |data| {
3130                    // Recycle the spent input buffer to the pool as-is.
3131                    // `add_data` runs this callback for every committed
3132                    // block (the bytes are mirrored into `history`), so
3133                    // growing the buffer to its full capacity here would
3134                    // zero the whole allocation on the hot path.
3135                    // `get_next_space` resizes a popped buffer to
3136                    // `slice_size` on demand, touching at most
3137                    // `slice_size` bytes — never the larger capacity the
3138                    // pool retains.
3139                    vec_pool.push(data);
3140                });
3141                // Plain `+` (the `saturating_sub` floors at 0): byte counts
3142                // bounded by the window, no overflow.
3143                evicted_bytes += (pre + space_len).saturating_sub(m.table.window_size);
3144            }
3145        }
3146        // Gate the second backend trim pass on actual budget
3147        // reclamation. Without it, every slice commit on the
3148        // no-dictionary / no-eviction path (the common case) would
3149        // run a backend `match` ladder + `trim_to_window` early-out
3150        // for no reason — `trim_after_budget_retire` only does
3151        // meaningful work when `retire_dictionary_budget` shrank
3152        // `max_window_size` enough to make the backend's
3153        // `window_size > max_window_size` invariant trigger
3154        // eviction.
3155        if self.retire_dictionary_budget(evicted_bytes) {
3156            self.trim_after_budget_retire();
3157        }
3158    }
3159
3160    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
3161        use super::strategy::{self, StrategyTag};
3162        // Borrowed one-shot Fast path: if the frame driver staged a
3163        // block range via `set_borrowed_block`, scan it in place against
3164        // the borrowed window instead of the owned committed block. Only
3165        // the Simple backend is instrumented (the gate guarantees it),
3166        // and the stage is consumed so the next block re-stages.
3167        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3168            match self.active_backend() {
3169                super::strategy::BackendTag::Simple => {
3170                    let m = self.simple_mut();
3171                    if m.dict_is_attached() {
3172                        // Dict-attach borrowed scan: live matches read the
3173                        // borrowed input in place, dict matches read the
3174                        // committed dict prefix via the 2-segment counter.
3175                        m.start_matching_borrowed_dict(
3176                            block_start,
3177                            block_end,
3178                            &mut handle_sequence,
3179                        );
3180                    } else {
3181                        m.start_matching_borrowed(block_start, block_end, &mut handle_sequence);
3182                    }
3183                }
3184                super::strategy::BackendTag::Dfast => self
3185                    .dfast_matcher_mut()
3186                    .start_matching_borrowed(block_start, block_end, &mut handle_sequence),
3187                super::strategy::BackendTag::Row => {
3188                    // Same greedy/lazy parse split as the owned RowHash arm.
3189                    let greedy = self.parse == super::strategy::ParseMode::Greedy;
3190                    self.row_matcher_mut().start_matching_borrowed(
3191                        block_start,
3192                        block_end,
3193                        greedy,
3194                        &mut handle_sequence,
3195                    );
3196                }
3197                super::strategy::BackendTag::HashChain => match self.search {
3198                    super::strategy::SearchMethod::HashChain => self
3199                        .hc_matcher_mut()
3200                        .start_matching_lazy_borrowed(block_start, block_end, &mut handle_sequence),
3201                    super::strategy::SearchMethod::BinaryTree => {
3202                        // Run the SAME BT dispatch as the owned BinaryTree arm
3203                        // below — every BT body reads its range via
3204                        // current_block_range() and bytes via live_history()
3205                        // (borrowed-aware), so the staged block is scanned in
3206                        // place. The table was already staged by
3207                        // `set_borrowed_block` (the HashChain arm at the top of
3208                        // this file calls `table.stage_borrowed_block` with the
3209                        // same range, and `borrowed_pending` is set only there),
3210                        // so no re-stage is needed here.
3211                        // Only btlazy2 reaches the borrowed BinaryTree scan:
3212                        // `borrowed_supported()` keeps the optimal parsers
3213                        // (BtOpt/BtUltra/BtUltra2) on the owned path, and
3214                        // `set_borrowed_block` asserts that predicate before any
3215                        // range is staged, so an optimal strategy_tag can never
3216                        // arrive here.
3217                        match self.strategy_tag {
3218                            StrategyTag::Btlazy2 => self
3219                                .hc_matcher_mut()
3220                                .start_matching_btlazy2(&mut handle_sequence),
3221                            other => unreachable!(
3222                                "borrowed BinaryTree scan is only supported for Btlazy2, got {other:?}"
3223                            ),
3224                        }
3225                    }
3226                    other => {
3227                        unreachable!("HashChain backend with unexpected search {other:?}")
3228                    }
3229                },
3230            }
3231            return;
3232        }
3233        // Decoupled parse×search dispatch (fires once per block). The
3234        // search axis (`self.search`) picks the candidate-finding backend;
3235        // the parse axis (greedy vs lazy depth) is carried by the
3236        // backend's runtime `lazy_depth`, set per level at `reset()`.
3237        // The two are independent, so any parse can run on any search
3238        // backend. The `BinaryTree` arm still selects the opt `Strategy`
3239        // ZST off `strategy_tag` so `compress_block::<S>` keeps its
3240        // const-folded optimal-parser monomorphisation.
3241        use super::strategy::SearchMethod;
3242        match self.search {
3243            SearchMethod::Fast => {
3244                self.simple_mut().start_matching(&mut handle_sequence);
3245                self.recycle_simple_space();
3246            }
3247            SearchMethod::DoubleFast => {
3248                self.dfast_matcher_mut()
3249                    .start_matching(&mut handle_sequence);
3250            }
3251            SearchMethod::RowHash => {
3252                // Greedy parse (depth 0) = upstream zstd-greedy entry (default
3253                // `ip + 1` start, greedy repcode commit); lazy / lazy2 use
3254                // the `pick_lazy_match` lookahead entry (reads `lazy_depth`).
3255                // Both bare entries dispatch on `row_log` internally into the
3256                // const-`ROW_LOG` hot loop (upstream zstd per-rowLog variant table).
3257                let greedy = self.parse == super::strategy::ParseMode::Greedy;
3258                let row = self.row_matcher_mut();
3259                if greedy {
3260                    row.start_matching_greedy(&mut handle_sequence);
3261                } else {
3262                    row.start_matching(&mut handle_sequence);
3263                }
3264            }
3265            SearchMethod::HashChain => {
3266                // Greedy/lazy/lazy2 all flow through the lazy parser; it
3267                // reads `hc.lazy_depth` (0 = greedy commit).
3268                self.hc_matcher_mut()
3269                    .start_matching_lazy(&mut handle_sequence);
3270            }
3271            SearchMethod::BinaryTree => match self.strategy_tag {
3272                StrategyTag::Btlazy2 => self
3273                    .hc_matcher_mut()
3274                    .start_matching_btlazy2(&mut handle_sequence),
3275                StrategyTag::BtOpt => self.compress_block::<strategy::BtOpt>(&mut handle_sequence),
3276                StrategyTag::BtUltra => {
3277                    self.compress_block::<strategy::BtUltra>(&mut handle_sequence)
3278                }
3279                StrategyTag::BtUltra2 => {
3280                    self.compress_block::<strategy::BtUltra2>(&mut handle_sequence)
3281                }
3282                _ => unreachable!(
3283                    "SearchMethod::BinaryTree requires a BT strategy tag (Btlazy2/BtOpt/BtUltra/BtUltra2)"
3284                ),
3285            },
3286        }
3287    }
3288
3289    fn skip_matching(&mut self) {
3290        self.skip_matching_with_hint(None);
3291    }
3292
3293    fn skip_matching_with_hint(&mut self, incompressible_hint: Option<bool>) {
3294        // Borrowed one-shot Fast path: a staged block range routes to the
3295        // borrowed skip (records the range for `get_last_space`, primes
3296        // hashes on the dict-priming hint) with no owned-history append
3297        // and nothing to recycle. Stage is consumed.
3298        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3299            match self.active_backend() {
3300                super::strategy::BackendTag::Simple => self.simple_mut().skip_matching_borrowed(
3301                    block_start,
3302                    block_end,
3303                    incompressible_hint,
3304                ),
3305                super::strategy::BackendTag::Dfast => self
3306                    .dfast_matcher_mut()
3307                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3308                super::strategy::BackendTag::Row => self.row_matcher_mut().skip_matching_borrowed(
3309                    block_start,
3310                    block_end,
3311                    incompressible_hint,
3312                ),
3313                super::strategy::BackendTag::HashChain => self
3314                    .hc_matcher_mut()
3315                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3316            }
3317            return;
3318        }
3319        match self.active_backend() {
3320            super::strategy::BackendTag::Simple => {
3321                self.simple_mut()
3322                    .skip_matching_with_hint(incompressible_hint);
3323                self.recycle_simple_space();
3324            }
3325            super::strategy::BackendTag::Dfast => {
3326                self.dfast_matcher_mut().skip_matching(incompressible_hint)
3327            }
3328            super::strategy::BackendTag::Row => self
3329                .row_matcher_mut()
3330                .skip_matching_with_hint(incompressible_hint),
3331            super::strategy::BackendTag::HashChain => {
3332                self.hc_matcher_mut().skip_matching(incompressible_hint)
3333            }
3334        }
3335    }
3336}
3337
3338impl MatchGeneratorDriver {
3339    /// Monomorphised optimal-parser entry point. Only the `BinaryTree`
3340    /// search arm of [`Matcher::start_matching`] routes here, selecting
3341    /// the concrete opt `S: Strategy` (BtOpt / BtUltra / BtUltra2) off
3342    /// `strategy_tag`, so the optimiser keeps the cost-model predicates
3343    /// (`S::USE_BT` / `S::USE_HASH3` / `S::ACCURATE_PRICE` /
3344    /// `S::TWO_PASS_SEED`) const-folded per strategy. The non-opt search
3345    /// backends (Fast / DoubleFast / RowHash / HashChain) are dispatched
3346    /// directly off the search axis and never reach this method, so all
3347    /// strategies arriving here are HashChain-backed.
3348    fn compress_block<S: super::strategy::Strategy>(
3349        &mut self,
3350        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
3351    ) {
3352        debug_assert_eq!(S::BACKEND, super::strategy::BackendTag::HashChain);
3353        debug_assert!(
3354            S::USE_BT,
3355            "compress_block only handles the optimal (BT) path"
3356        );
3357        self.hc_matcher_mut()
3358            .start_matching_strategy::<S>(handle_sequence);
3359    }
3360}
3361
/// Stage D: backend storage discriminator for [`HcMatchGenerator`].
///
/// HC (lazy / lazy2) modes carry no extra per-frame state beyond the
/// shared `MatchTable` and `HcMatcher` runtime knobs, so the
/// [`HcBackend::Hc`] variant is zero-sized — no BT scratch is
/// allocated. BT-flavoured modes (`btopt` / `btultra` / `btultra2`)
/// hold the full [`super::bt::BtMatcher`] inside the
/// [`HcBackend::Bt`] variant (cost model, optimal-parser scratch
/// arenas, LDM candidate buffer).
///
/// The discriminator lives next to `parse_mode` so `configure()` can
/// promote between the two on a level change without touching the
/// `MatchTable` storage.
///
/// `Clone` is derived, so cloning a [`HcBackend::Bt`] value clones the
/// boxed `BtMatcher` along with it.
#[derive(Clone)]
pub(crate) enum HcBackend {
    /// Lazy / lazy2 modes — no per-frame backend state.
    Hc,
    /// BT-driven modes — owns the optimal parser's per-frame scratch.
    /// Boxed so the enum stays pointer-sized: HC-only matchers pay
    /// just the `Box`-niche, not the 4 KiB `BtMatcher` payload.
    Bt(alloc::boxed::Box<super::bt::BtMatcher>),
}
3384
3385impl HcBackend {
3386    /// Heap bytes held by the backend. `Hc` is zero-sized; `Bt` boxes a
3387    /// `BtMatcher`, so count the boxed payload plus its own scratch heap.
3388    fn heap_size(&self) -> usize {
3389        match self {
3390            Self::Hc => 0,
3391            Self::Bt(bt) => core::mem::size_of::<super::bt::BtMatcher>() + bt.heap_size(),
3392        }
3393    }
3394
3395    /// Mutable accessor on the BT matcher; panics if the active
3396    /// backend is `Hc`. The HC-or-Bt branches in orchestrator code use
3397    /// `let HcBackend::Bt(bt) = &self.backend` directly for readonly
3398    /// access — this helper exists so macro bodies that already drive
3399    /// a mutable BT update through the optimal parser can write
3400    /// `$self.backend.bt_mut().X` without an outer `match` ladder.
3401    #[inline(always)]
3402    pub(crate) fn bt_mut(&mut self) -> &mut super::bt::BtMatcher {
3403        match self {
3404            Self::Bt(bt) => bt,
3405            Self::Hc => unreachable!("BT-only accessor called in HC mode"),
3406        }
3407    }
3408}
3409
/// Match generator behind the hash-chain and binary-tree search methods.
///
/// Bundles the shared match table, the HC runtime knobs, the optional BT
/// backend state and the strategy tag. `Clone` is derived, so a clone
/// copies every field, including the table storage.
#[derive(Clone)]
struct HcMatchGenerator {
    /// Shared match-finder storage (window, history, hash / chain /
    /// hash3 tables, dictionary-priming flags). Used identically by HC
    /// and BT modes; backend-specific table interpretation lives in the
    /// matcher methods on this struct.
    table: super::match_table::storage::MatchTable,
    /// HC runtime knobs (lazy_depth, search_depth, target_len). Always
    /// present — BT modes still consult `hc.search_depth` for repcode
    /// probing and chain candidate enumeration.
    hc: super::hc::HcMatcher,
    /// Backend discriminator. [`HcBackend::Hc`] is zero-sized for the
    /// lazy / lazy2 path so HC-only generators don't carry the BT
    /// optimal-parser scratch buffers. [`HcBackend::Bt`] holds the
    /// `BtMatcher` when an optimal mode is configured.
    backend: HcBackend,
    /// Compile-time strategy tag mirrored from
    /// [`MatchGeneratorDriver::strategy_tag`] during `configure()`.
    /// The driver hot path never reads this — it dispatches to
    /// `compress_block::<S>` from its own tag — but the
    /// `#[cfg(test)] start_matching` helper consumes it so artificial
    /// test setups still pick the correct concrete `S` for the
    /// const-generic optimal parser (BtOpt vs BtUltra vs BtUltra2).
    /// Without this field the test path would have to collapse
    /// `BtOpt` and `BtUltra` onto the same monomorphisation since
    /// `table.uses_bt` / `table.is_btultra2` alone can't tell them
    /// apart.
    strategy_tag: super::strategy::StrategyTag,
}
3439
3440// Plain-data types relocated to [`crate::encoding::opt::types`] and
3441// [`crate::encoding::opt::ldm`] by #111 Phase 1. The use statements at
3442// the top of this file bring them back into scope so the existing
3443// methods on `HcMatchGenerator` compile unchanged.
3444
/// `bt_insert_step_no_rebase` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Each kernel-specific wrapper invokes
/// the macro with its own `fastpath::<kernel>::count_match_from_indices`
/// path so the call resolves inside the wrapper's `#[target_feature]`
/// umbrella and inlines instead of paying the function-call ABI per BT walk
/// iteration. Used only by `HcMatchGenerator` BT walk wrappers below.
///
/// Arguments:
/// - `$table`: the match table whose hash table and binary-tree pair table
///   (`chain_table`) are updated in place.
/// - `$search_depth`: maximum number of tree nodes compared for this
///   position.
/// - `$abs_pos`: absolute position being inserted.
/// - `$current_abs_end`: absolute end of the current block; bounds how far
///   a match may extend.
/// - `$target_abs`: absolute position passed to
///   `window_low_abs_for_target` to derive the lowest usable candidate.
/// - `$cmf`: path of the per-kernel `count_match_from_indices` function.
///
/// The expansion is a block expression evaluating to a `usize` step count.
/// It also contains bare `return 1;` statements, so it must be expanded as
/// the body of a function that returns that integer.
///
/// Crate-private: the macro body references private `encoding::*`
/// modules via `$crate::...`, so it is unusable downstream and is
/// re-exported only inside this crate via `pub(crate) use` below.
macro_rules! bt_insert_step_no_rebase_body {
    ($table:expr, $search_depth:expr, $abs_pos:ident, $current_abs_end:ident, $target_abs:ident, $cmf:path) => {{
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        // Fewer than 8 readable bytes at this position: nothing is inserted
        // and the caller is told to advance by one.
        if idx + 8 > concat.len() {
            return 1;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT walker called past current block end"
        );
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults. The address is
            // formed with `wrapping_add`, which (unlike `add`) has no
            // in-bounds requirement, so this stays sound even though the
            // bounds-checked `hash_table[hash]` access only happens further
            // below.
            unsafe {
                _mm_prefetch(
                    $table.hash_table.as_ptr().wrapping_add(hash).cast(),
                    _MM_HINT_T0,
                );
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: prefetch never faults, so an out-of-range address
                // is a harmless no-op hint. `wrapping_add` is used instead of
                // `add` because `add` itself is undefined behaviour when the
                // resulting pointer leaves the allocation, regardless of what
                // is done with it afterwards.
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().wrapping_add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return 1;
        };
        // Table entries store `relative position + 1`; the walk below
        // subtracts the 1 again after checking for `HC_EMPTY`.
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // `abs_pos < bt_mask` legitimately happens for the first BT walk of
        // a fresh frame (bt_low effectively "no floor"). Saturating keeps
        // the floor at 0 so the `candidate_abs <= bt_low` check never
        // triggers early; raw subtraction would underflow into a huge
        // sentinel that ALWAYS triggers.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        // Hoist the BT pointer-pair base out of `self` once — see the
        // collect-matches body for the full rationale (per-step Vec reload +
        // bounds check through `&mut self` vs the upstream zstd's raw `U32*` walk).
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        let window_low = $table.window_low_abs_for_target($target_abs);
        // `abs_pos + 9` is safe in raw form: `MatchTable::add_data` caps
        // total input at `usize::MAX - STREAM_ABS_HEADROOM` (where
        // `STREAM_ABS_HEADROOM = HC_OPT_NUM + 16`), so every
        // frame-lifetime absolute cursor passed to the BT walker stays
        // below `usize::MAX - 9` regardless of stream length or
        // pointer width. The guard is hoisted to the data-ingest
        // boundary so this per-position site pays zero arithmetic
        // overhead in the hot loop.
        let mut match_end_abs = $abs_pos + 9;
        let mut best_len = 8usize;
        let mut compares_left = $search_depth;
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // Read the previous bucket head as the walk's first candidate, then
        // publish the current position as the new head.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;

        while compares_left > 0 {
            if match_stored == $crate::encoding::match_table::storage::HC_EMPTY {
                break;
            }
            // Reject stale post-rebase slots whose pre-shift position is below
            // `index_shift` explicitly. A `wrapping_sub` maps such a slot to a
            // near-`usize::MAX` value that the `>= abs_pos` test only rejects
            // while `abs_pos` is far from the integer ceiling; on a
            // long-running rebased stream (reachable on 32-bit) `abs_pos` can
            // approach the ceiling and the wrapped value can land back inside
            // `[window_low, abs_pos)`. `checked_sub` ends the walk on the
            // underflow instead. `match_stored != HC_EMPTY` here, so the `- 1`
            // cannot underflow.
            let Some(candidate_abs) = ($table.position_base + (match_stored as usize - 1))
                .checked_sub($table.index_shift)
            else {
                break;
            };
            if candidate_abs < window_low || candidate_abs >= $abs_pos {
                break;
            }
            compares_left -= 1;

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()` since the candidate is within
            // `[history_abs_start, abs_pos)` and `tail_limit ≤
            // current_abs_end - abs_pos`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            if match_len > best_len {
                best_len = match_len;
                // `candidate_abs + match_len <= current_abs_end` by BT walk
                // invariant — `match_len <= tail_limit = current_abs_end -
                // abs_pos` and `candidate_abs < abs_pos`.
                let candidate_end = candidate_abs + match_len;
                if candidate_end > match_end_abs {
                    match_end_abs = candidate_end;
                }
            }

            // The match reaches the end of the block, so there is no
            // differing byte left to order the two suffixes by.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        let speed_positions = if best_len > 384 {
            (best_len - 384).min(192)
        } else {
            0
        };
        // `match_end_abs` is initialized to `abs_pos + 9` and is only
        // reassigned inside the `candidate_end > match_end_abs` branch
        // above. So even though an individual `candidate_end =
        // candidate_abs + match_len` can land below `abs_pos` (the
        // candidate sits earlier in history and the match runs short),
        // the variable itself never drops below its initial value.
        // That gives `match_end_abs ≥ abs_pos + 9 > abs_pos + 8` as a
        // loop-wide invariant, so the raw subtraction below cannot
        // underflow.
        speed_positions.max(match_end_abs - ($abs_pos + 8))
    }};
}
3674pub(crate) use bt_insert_step_no_rebase_body;
3675
// NOTE: the paragraph below documents the `build_optimal_plan_impl` body
// macro, which is not defined at this point in the file. It is kept as a
// plain comment so rustdoc does not attach it to `btlazy2_offbase` below.
//
// `build_optimal_plan_impl` body parameterized over the per-CPU
// `collect_optimal_candidates_initialized_<kernel>` method name. Caller
// passes its `&mut self`, the seven DP entry-point arguments, and the
// kernel-specific collect method. Each per-kernel wrapper invokes this
// macro inside its own `#[target_feature]` umbrella so the per-position
// `$collect` call inlines and the entire DP loop runs as one straight-line
// hot path without an ABI barrier between the DP and the match-gathering
// pipeline.
//
// Body is ~730 lines but mechanically identical across kernels — the macro
// keeps a single source of truth. The two const generics
// (`ACCURATE_PRICE`, `FAVOR_SMALL_OFFSETS`) come from the wrapper's
// generic parameter list and are referenced as bare identifiers; macro
// hygiene resolves them at the expansion site.
/// `offBase` as used by the btlazy2 lazy gain heuristic (upstream zstd
/// `zstd_lazy.c`, `ZSTD_storeSeq` offBase convention).
///
/// An offset equal to one of the three active repeat offsets is priced as
/// the cheap repcode code `1`, `2` or `3`; every other offset is priced as
/// `offset + 3`. An equal-length repeat-offset match therefore always
/// out-gains an explicit-offset one.
///
/// `ll0` marks a zero-literal position. There the repcode mapping is
/// shifted: the cheap codes stand for rep1 / rep2 / (rep0 - 1) instead of
/// rep0 / rep1 / rep2, because at `ll0` an offset equal to rep0 is the
/// special rep0-1 case, not repcode 1. Scoring against the unshifted set
/// would over-reward a rep0-distance match that does not actually encode
/// as the cheapest code.
#[inline]
fn btlazy2_offbase(offset: usize, reps: [u32; 3], ll0: bool) -> u32 {
    let [rep0, rep1, rep2] = reps;
    let off = offset as u32;
    if ll0 {
        match off {
            o if o == rep1 => 1,
            o if o == rep2 => 2,
            // `rep0 > 1` keeps `rep0 - 1` from underflowing and from
            // treating a zero offset as a repcode.
            o if rep0 > 1 && o == rep0 - 1 => 3,
            // Offsets are < window (<= 2^27), so `+ 3` never overflows u32.
            o => o + 3,
        }
    } else {
        match off {
            o if o == rep0 => 1,
            o if o == rep1 => 2,
            o if o == rep2 => 3,
            // Offsets are < window (<= 2^27), so `+ 3` never overflows u32.
            o => o + 3,
        }
    }
}
3725
3726/// Upstream zstd lazy match gain (`matchLength * 4 - ZSTD_highbit32(offBase)`): the
3727/// selection metric that lets a shorter repeat-offset match beat a longer
3728/// explicit-offset one. `offBase >= 1`, so `highbit` is well-defined.
3729#[inline]
3730fn btlazy2_gain(match_len: usize, offset: usize, reps: [u32; 3], ll0: bool) -> i64 {
3731    let offbase = btlazy2_offbase(offset, reps, ll0);
3732    (match_len as i64) * 4 - (31 - offbase.leading_zeros()) as i64
3733}
3734
/// Per-kernel body of the `btlazy2` (levels 13-15) greedy/lazy parse over
/// the binary-tree match finder. Mirrors `build_optimal_plan_impl_body!`'s
/// kernel-dispatch discipline: the wrapper carries the `#[target_feature]`
/// umbrella and passes its tier-specific `collect_optimal_candidates_initialized_<kernel>`
/// as `$collect`, so the per-position BT collect (and its inlined cpl)
/// stays under one umbrella — the runtime `select_kernel()` dispatch happens
/// ONCE per block in the bare `start_matching_btlazy2`, never per position.
///
/// Arguments:
/// - `$self`: the generator; the body reads and updates `$self.table`,
///   reads `$self.hc.lazy_depth` and borrows the candidate scratch from
///   `$self.backend.bt_mut()`.
/// - `$handle_sequence`: callable invoked with each emitted
///   `Sequence::Triple` and with the trailing `Sequence::Literals`.
/// - `$collect`: name of the per-kernel candidate-collection method on
///   `$self`.
/// - `$cmf`: path of the per-kernel match-length counting function used for
///   the explicit repcode probe.
///
/// The expansion contains a bare `return;` for the empty-block case, so it
/// must be the body of a function returning `()`.
macro_rules! start_matching_btlazy2_body {
    ($self:ident, $handle_sequence:ident, $collect:ident, $cmf:path $(,)?) => {{
        $self.table.ensure_tables();
        // Borrowed-aware: owned → last committed chunk; borrowed → staged block.
        let (current_abs_start, current_len) = $self.table.current_block_range();
        if current_len == 0 {
            return;
        }
        let current_ptr = $self.table.get_last_space().as_ptr();
        // Mutates tables but never reallocates `history`, so this tail slice
        // stays valid for the routine's duration (same as the other parsers).
        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
        // Full contiguous live region (owned: dict + prior blocks + current
        // block in `history`; borrowed: `[0, block_end)` of the in-place
        // input) as a raw slice, for the explicit repcode probe: a rep offset
        // can point before the current block, which `current` can't reach.
        // `live_history()` is borrowed-aware; reborrow-then-raw-ptr so the
        // slice holds NO borrow and coexists with the `&mut self` collector
        // calls below. Same no-realloc validity contract as `current`.
        let history_abs_start = $self.table.history_abs_start;
        let concat_full: &[u8] = unsafe {
            let lh = $self.table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        let current_abs_end = current_abs_start + current_len;
        $self
            .table
            .apply_limited_update_after_long_match(current_abs_start);
        $self
            .table
            .backfill_boundary_positions(current_abs_start, current_abs_end);

        let profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::Btlazy2>();
        // Move the reusable candidate buffer out of the BT matcher so it can
        // be filled while `$self` is mutably borrowed by the collector; it is
        // handed back at the end of the body.
        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);

        let depth = $self.hc.lazy_depth as usize;
        // `pos` is the parse cursor and `literals_start` the first byte not
        // yet covered by an emitted sequence, both relative to `current`.
        let mut pos = 0usize;
        let mut literals_start = 0usize;

        // Collect + select the highest-GAIN match at a position (upstream zstd
        // `ZSTD_searchMax` plus the explicit offset_1 repcode check): scan the
        // length-sorted BT/dms ladder by gain, then probe rep0 directly since
        // the ladder's strictly-increasing-length filter drops short cheap
        // reps. Expands to `(match_len, offset)`; `match_len == 0` = no match.
        macro_rules! bt_select {
            ($p:expr) => {{
                let sel_pos: usize = $p;
                // `ll0` (upstream zstd): zero literals pending before this position, so
                // the repcode set is shifted (see `btlazy2_offbase`).
                let ll0 = sel_pos == literals_start;
                let sel_abs = current_abs_start + sel_pos;
                candidates.clear();
                let query = HcCandidateQuery {
                    reps: $self.table.offset_hist,
                    lit_len: sel_pos - literals_start,
                    // No LDM seed: L13-15 run at windowLog 22, below upstream zstd's
                    // LDM auto-enable threshold (windowLog >= 27).
                    ldm_candidate: None,
                };
                // SAFETY: called inside the wrapper's `#[target_feature]`
                // umbrella (the scalar wrapper's `$collect` is a safe fn).
                unsafe {
                    $self.$collect::<super::strategy::Btlazy2, true>(
                        sel_abs,
                        current_abs_end,
                        profile,
                        query,
                        &mut candidates,
                    );
                }
                let reps = $self.table.offset_hist;
                let mut sel_ml = 0usize;
                let mut sel_off = 0usize;
                let mut sel_gain = i64::MIN;
                for c in candidates.iter() {
                    // Clamp to the bytes left in the block before scoring.
                    let ml = c.match_len.min(current_len - sel_pos);
                    if ml < HC_OPT_MIN_MATCH_LEN {
                        continue;
                    }
                    let g = btlazy2_gain(ml, c.offset, reps, ll0);
                    // Strict `>`: on equal gain the earlier ladder entry is kept.
                    if g > sel_gain {
                        sel_gain = g;
                        sel_ml = ml;
                        sel_off = c.offset;
                    }
                }
                let sel_idx = sel_abs - history_abs_start;
                // Upstream zstd probes `rep[0 + ll0]` directly (the length-sorted ladder
                // drops short cheap reps): rep0 normally, rep1 at a zero-literal
                // position where rep0 is not the cheapest code.
                let probe_rep = if ll0 {
                    reps[1] as usize
                } else {
                    reps[0] as usize
                };
                // `sel_idx >= probe_rep` keeps the probed source index inside
                // `concat_full` (no underflow in `sel_idx - probe_rep`).
                if probe_rep != 0 && sel_idx >= probe_rep {
                    let tail = current_len - sel_pos;
                    // SAFETY: `sel_idx - probe_rep < sel_idx`, `sel_idx + tail <=
                    // concat_full.len()`; same overshoot slack the collector
                    // relies on for this block.
                    let rep_ml =
                        unsafe { $cmf(concat_full, sel_idx, sel_idx - probe_rep, tail, 0) };
                    // `sel_gain` is not refreshed here: it is not read again
                    // after this final comparison.
                    if rep_ml >= HC_OPT_MIN_MATCH_LEN
                        && btlazy2_gain(rep_ml, probe_rep, reps, ll0) > sel_gain
                    {
                        sel_ml = rep_ml;
                        sel_off = probe_rep;
                    }
                }
                (sel_ml, sel_off)
            }};
        }

        while pos + HC_OPT_MIN_MATCH_LEN <= current_len {
            let (mut best_ml, mut best_off) = bt_select!(pos);
            // No usable match here: the byte stays a pending literal.
            if best_ml < HC_OPT_MIN_MATCH_LEN {
                pos += 1;
                continue;
            }
            // Lazy lookahead (upstream zstd depth 1/2): advance one byte and accept the
            // later match only if it out-gains the current one by the upstream zstd
            // margin (deferring costs an extra literal — `+4` at depth 1, `+7`
            // at depth 2). `start` tracks where the chosen match begins.
            let mut start = pos;
            let mut d = 0usize;
            while d < depth && start + 1 + HC_OPT_MIN_MATCH_LEN <= current_len {
                let look = start + 1;
                let (ml2, off2) = bt_select!(look);
                if ml2 < HC_OPT_MIN_MATCH_LEN {
                    break;
                }
                let reps = $self.table.offset_hist;
                let margin = if d == 0 { 4 } else { 7 };
                // `best` sits at `start` (ll0 iff no literals precede it); the
                // lookahead match at `start + 1` always has a pending literal.
                let gain1 = btlazy2_gain(best_ml, best_off, reps, start == literals_start) + margin;
                let gain2 = btlazy2_gain(ml2, off2, reps, false);
                if gain2 > gain1 {
                    best_ml = ml2;
                    best_off = off2;
                    start = look;
                    d += 1;
                } else {
                    break;
                }
            }
            // Commit the chosen match at `start`; [literals_start, start) is
            // emitted as literals. `best_ml` was bounded to `current_len -
            // start` at selection, so `start + best_ml <= current_len`.
            let lit_len = start - literals_start;
            let literals = &current[literals_start..start];
            $handle_sequence(Sequence::Triple {
                literals,
                offset: best_off,
                match_len: best_ml,
            });
            // Called only for its update of `offset_hist`; the returned
            // value is discarded.
            let _ = encode_offset_with_history(
                best_off as u32,
                lit_len as u32,
                &mut $self.table.offset_hist,
            );
            pos = start + best_ml;
            literals_start = pos;
        }

        // Whatever follows the last match goes out as a literals-only sequence.
        if literals_start < current_len {
            $handle_sequence(Sequence::Literals {
                literals: &current[literals_start..],
            });
        }
        // Hand the scratch buffer back so a later call can reuse it.
        $self.backend.bt_mut().opt_candidates_scratch = candidates;
    }};
}
3915
/// 8-lane `next_cost < node_price` mask for the optimal-parser price-set
/// loop. AVX2 lacks an unsigned `cmplt`, so derive `nc < np` from
/// `min_epu32`: `nc <= np` iff `min(nc,np) == nc`, then exclude equality.
/// Returns a bitmask (bit `k` set => lane `k` improves). Compiled on every
/// x86 target (same as the avx2 collect kernel); the cargo `kernel_avx2`
/// feature only gates the runtime dispatch, not compilation.
///
/// Only the first 8 elements of `node_price` are compared; anything past
/// them is ignored.
///
/// # Safety
///
/// - The executing CPU must support AVX2.
/// - `node_price` must contain at least 8 elements: 8 lanes are loaded from
///   it unconditionally. This is checked only in builds with debug
///   assertions enabled.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn priceset_improved_mask8_avx2(next_cost: &[u32; 8], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    // Catch a too-short slice in debug/test builds before the 256-bit load
    // below would read past its end.
    debug_assert!(
        node_price.len() >= 8,
        "priceset_improved_mask8_avx2 needs 8 node prices"
    );
    // SAFETY: `next_cost` is exactly 8 `u32`s; unaligned loads are allowed.
    let nc = unsafe { _mm256_loadu_si256(next_cost.as_ptr() as *const __m256i) };
    // SAFETY: the caller guarantees `node_price.len() >= 8` (see `# Safety`).
    let np = unsafe { _mm256_loadu_si256(node_price.as_ptr() as *const __m256i) };
    let min = _mm256_min_epu32(nc, np);
    let le = _mm256_cmpeq_epi32(min, nc); // nc <= np
    let eq = _mm256_cmpeq_epi32(nc, np); // nc == np
    let lt = _mm256_andnot_si256(eq, le); // nc < np
    // `movemask_ps` gathers the sign bit of each 32-bit lane into bit `k`.
    _mm256_movemask_ps(_mm256_castsi256_ps(lt)) as u8
}
3948
3949/// Inline `next_cost = base_cost + ll0_price + match_price_from_parts(off,ml)`
3950/// for one match length — the exact `add_prices` chain the scalar loop uses,
3951/// so the SoA vector path stays byte-identical.
3952#[inline(always)]
3953#[allow(clippy::too_many_arguments)]
3954fn priceset_next_cost(
3955    profile: HcOptimalCostProfile,
3956    stats: &HcOptState,
3957    ml_cache: &mut [[u32; 2]],
3958    ml_stamp: u32,
3959    match_len: usize,
3960    ll0_price: u32,
3961    off_price: u32,
3962    base_cost: u32,
3963) -> u32 {
3964    let ml_price =
3965        BtMatcher::cached_match_length_price(profile, stats, match_len, ml_cache, ml_stamp);
3966    let seq_cost = BtMatcher::add_prices(
3967        ll0_price,
3968        profile.match_price_from_parts(off_price, ml_price, stats),
3969    );
3970    BtMatcher::add_prices(base_cost, seq_cost)
3971}
3972
3973/// Scalar price-set over the match-length range `[start, max]` for the
3974/// NON-abort optimal modes (btultra / btultra2). Each `match_len` writes a
3975/// distinct node `pos + match_len`, so order is irrelevant; the improvement
3976/// test reduces to `next_cost < node_prices[next]` (`reset_opt_nodes` set
3977/// every beyond-frontier cell to `u32::MAX`, subsuming `next > last_pos`).
3978/// `#[inline]` so it folds into each per-tier optimal-parser monomorphisation
3979/// (no call overhead). Returns the highest written `next`.
3980#[inline]
3981#[allow(clippy::too_many_arguments)]
3982// Used by the scalar / sse42 DP wrappers; on aarch64 the dispatch only reaches
3983// the neon wrapper and on wasm+simd128 only the simd128 wrapper, so this is
3984// cfg-dead on those targets.
3985#[cfg_attr(
3986    any(
3987        all(target_arch = "aarch64", target_endian = "little"),
3988        all(target_arch = "wasm32", target_feature = "simd128")
3989    ),
3990    allow(dead_code)
3991)]
3992fn priceset_range_nonabort_scalar(
3993    node_prices: &mut [u32],
3994    nodes: &mut [HcOptimalNode],
3995    ml_cache: &mut [[u32; 2]],
3996    ml_stamp: u32,
3997    profile: HcOptimalCostProfile,
3998    stats: &HcOptState,
3999    pos: usize,
4000    start: usize,
4001    max: usize,
4002    ll0_price: u32,
4003    off_price: u32,
4004    base_cost: u32,
4005    off: u32,
4006    reps: [u32; 3],
4007    last_pos: usize,
4008) -> usize {
4009    let mut new_last = last_pos;
4010    for ml in start..=max {
4011        let next_cost = priceset_next_cost(
4012            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4013        );
4014        let next = pos + ml;
4015        if next_cost < node_prices[next] {
4016            node_prices[next] = next_cost;
4017            nodes[next] = HcOptimalNode {
4018                off,
4019                mlen: ml as u32,
4020                litlen: 0,
4021                reps,
4022            };
4023            if next > new_last {
4024                new_last = next;
4025            }
4026        }
4027    }
4028    new_last
4029}
4030
/// Per-tier deinterleave + improve-mask correctness vs a scalar reference.
/// Each tier's dispatch only fires on matching hardware (i9 picks AVX2 over
/// SSE4.1, M1 picks NEON), so the non-dispatched tiers never run in the
/// roundtrip suite; this exercises the deinterleave/mask helpers directly on
/// whatever ISA the test host exposes (AVX2 + SSE4.1 on x86 with `std`, NEON
/// on little-endian aarch64, simd128 on wasm32 built with `+simd128`).
///
/// The scalar reference is first pinned to literal expectations. That keeps
/// the test meaningful (and every fixture used) on hosts that expose none of
/// the vector tiers, and guards against the reference and a tier being wrong
/// in the same way.
#[cfg(test)]
#[test]
fn priceset_tier_helpers_match_scalar() {
    // Reference: gen-stamped contiguous cells -> ordered prices on all-warm,
    // `None` as soon as one of the first `W` cells carries another stamp.
    fn scalar_deint<const W: usize>(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; W]> {
        let mut out = [0u32; W];
        for (slot, cell) in out.iter_mut().zip(&cells[..W]) {
            if cell[1] != stamp {
                return None;
            }
            *slot = cell[0];
        }
        Some(out)
    }
    // Reference: bit `k` set iff `nc[k] < np[k]` (unsigned, strict).
    fn scalar_mask<const W: usize>(nc: &[u32; W], np: &[u32]) -> u8 {
        nc.iter()
            .zip(&np[..W])
            .enumerate()
            .filter(|(_, (cost, price))| cost < price)
            .fold(0u8, |bits, (lane, _)| bits | (1 << lane))
    }
    const S: u32 = 0x55;
    let warm: [[u32; 2]; 4] = [[11, S], [22, S], [33, S], [44, S]];
    let mut cold = warm;
    cold[2][1] = S ^ 1; // one stale cell -> must yield None
    let nc4: [u32; 4] = [10, 99, 30, 41];
    let np4: [u32; 4] = [20, 21, 30, 99]; // lt: lane0 (10<20), lane3 (41<99)

    // Host-independent: pin the reference itself before comparing tiers to it.
    assert_eq!(scalar_deint::<4>(&warm, S), Some([11, 22, 33, 44]));
    assert_eq!(scalar_deint::<4>(&cold, S), None);
    assert_eq!(scalar_mask::<4>(&nc4, &np4), 0b1001);

    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    unsafe {
        assert_eq!(
            priceset_cached_prices4_neon(&warm, S),
            scalar_deint::<4>(&warm, S)
        );
        assert_eq!(priceset_cached_prices4_neon(&cold, S), None);
        assert_eq!(
            priceset_improved_mask4_neon(&nc4, &np4),
            scalar_mask::<4>(&nc4, &np4)
        );
    }
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    unsafe {
        assert_eq!(
            priceset_cached_prices4_simd128(&warm, S),
            scalar_deint::<4>(&warm, S)
        );
        assert_eq!(priceset_cached_prices4_simd128(&cold, S), None);
        assert_eq!(
            priceset_improved_mask4_simd128(&nc4, &np4),
            scalar_mask::<4>(&nc4, &np4)
        );
    }
    #[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
    {
        // The `*_sse41` helpers are compiled under
        // `target_feature(enable = "sse4.2")`, so that is the feature probed.
        if std::is_x86_feature_detected!("sse4.2") {
            unsafe {
                assert_eq!(
                    priceset_cached_prices4_sse41(&warm, S),
                    scalar_deint::<4>(&warm, S)
                );
                assert_eq!(priceset_cached_prices4_sse41(&cold, S), None);
                assert_eq!(
                    priceset_improved_mask4_sse41(&nc4, &np4),
                    scalar_mask::<4>(&nc4, &np4)
                );
            }
        }
        if std::is_x86_feature_detected!("avx2") {
            let warm8: [[u32; 2]; 8] = [
                [11, S],
                [22, S],
                [33, S],
                [44, S],
                [55, S],
                [66, S],
                [77, S],
                [88, S],
            ];
            let mut cold8 = warm8;
            cold8[5][1] = S ^ 1;
            let nc8: [u32; 8] = [10, 99, 30, 41, 99, 60, 99, 80];
            let np8: [u32; 8] = [20, 21, 30, 99, 50, 99, 70, 99];
            // lt: lanes 0, 3, 5 and 7.
            assert_eq!(scalar_mask::<8>(&nc8, &np8), 0b1010_1001);
            unsafe {
                assert_eq!(
                    priceset_cached_prices8_avx2(&warm8, S),
                    scalar_deint::<8>(&warm8, S)
                );
                assert_eq!(priceset_cached_prices8_avx2(&cold8, S), None);
                assert_eq!(
                    priceset_improved_mask8_avx2(&nc8, &np8),
                    scalar_mask::<8>(&nc8, &np8)
                );
            }
        }
    }
}
4122
4123/// Shared vectorised price-set loop body, generic over the SIMD width `W`.
4124/// The per-tier `deint` (vector-load plus deinterleave of `W` cached prices,
4125/// returning `Some` only on an all-warm chunk) and `mask` (per-tier
4126/// `next_cost` less-than `node_price` bitmask) are passed as zero-sized
4127/// `impl Fn`s. `#[inline(always)]` plus monomorphisation folds `deint` and
4128/// `mask` directly into each per-tier wrapper's `target_feature` umbrella, so
4129/// the intrinsics inline with no call ABI and no runtime feature detection.
4130/// Cold or out-of-cache chunks, and the sub-`W` remainder, fall back to the
4131/// scalar `priceset_next_cost` (which fills the cache); writes are
4132/// scalar-scatter on the improving lanes (1-8% of compares, per the
4133/// improve-ratio probe). Same signature tail as the scalar variant.
4134#[inline(always)]
4135#[allow(clippy::too_many_arguments)]
4136// Instantiated only by a vector tier wrapper (avx2/sse4.1 on x86, neon on
4137// aarch64, simd128 on wasm+simd128); a target with none of those (e.g.
4138// wasm without +simd128) uses only the scalar range, leaving this generic dead.
4139#[cfg_attr(
4140    not(any(
4141        target_arch = "x86",
4142        target_arch = "x86_64",
4143        all(target_arch = "aarch64", target_endian = "little"),
4144        all(target_arch = "wasm32", target_feature = "simd128")
4145    )),
4146    allow(dead_code)
4147)]
4148fn priceset_range_vec<const W: usize>(
4149    node_prices: &mut [u32],
4150    nodes: &mut [HcOptimalNode],
4151    ml_cache: &mut [[u32; 2]],
4152    ml_stamp: u32,
4153    profile: HcOptimalCostProfile,
4154    stats: &HcOptState,
4155    pos: usize,
4156    start: usize,
4157    max: usize,
4158    ll0_price: u32,
4159    off_price: u32,
4160    base_cost: u32,
4161    off: u32,
4162    reps: [u32; 3],
4163    last_pos: usize,
4164    deint: impl Fn(&[[u32; 2]], u32) -> Option<[u32; W]>,
4165    mask: impl Fn(&[u32; W], &[u32]) -> u8,
4166) -> usize {
4167    let mut new_last = last_pos;
4168    let mut buf = [0u32; W];
4169    // Loop-invariant constant of the byte-identical next_cost chain:
4170    // next_cost = add_prices(base_cost, add_prices(ll0_price,
4171    //   match_price_from_parts(off_price, ml_price))) = c_base + ml_price,
4172    // c_base = base_cost + ll0_price + match_price_from_parts(off_price, 0).
4173    //
4174    // This stays bit-exact with the scalar `priceset_next_cost` because both
4175    // helpers are affine in `ml_price`: `BtMatcher::add_prices(a, b) = a + b`
4176    // and `match_price_from_parts(off, ml) = off + ml + bias` are plain integer
4177    // additions, so `match_price_from_parts(off, ml) = match_price_from_parts(
4178    // off, 0) + ml` and the whole chain collapses to `c_base + ml_price`. The
4179    // `wrapping_add` here matches the scalar `+` under the cost model's
4180    // no-overflow invariant (the `debug_assert`s in both helpers). Factoring the
4181    // combine into one helper per the review suggestion would force a per-lane
4182    // `match_price_from_parts(off, ml_price)` recompute instead of hoisting the
4183    // ml-independent `c_base` once — a regression on this hot DP loop — so the
4184    // hoist is kept and the equivalence documented here instead.
4185    let c_base = base_cost
4186        .wrapping_add(ll0_price)
4187        .wrapping_add(profile.match_price_from_parts(off_price, 0, stats));
4188    let mut ml = start;
4189    while ml + W <= max + 1 {
4190        let vectorised = if ml + W <= ml_cache.len() {
4191            deint(&ml_cache[ml..ml + W], ml_stamp)
4192        } else {
4193            None
4194        };
4195        if let Some(prices) = vectorised {
4196            for (k, slot) in buf.iter_mut().enumerate() {
4197                *slot = c_base.wrapping_add(prices[k]);
4198            }
4199        } else {
4200            for (k, slot) in buf.iter_mut().enumerate() {
4201                *slot = priceset_next_cost(
4202                    profile,
4203                    stats,
4204                    ml_cache,
4205                    ml_stamp,
4206                    ml + k,
4207                    ll0_price,
4208                    off_price,
4209                    base_cost,
4210                );
4211            }
4212        }
4213        let base_next = pos + ml;
4214        let mut bits = mask(&buf, &node_prices[base_next..base_next + W]);
4215        while bits != 0 {
4216            let k = bits.trailing_zeros() as usize;
4217            bits &= bits - 1;
4218            let next = base_next + k;
4219            node_prices[next] = buf[k];
4220            nodes[next] = HcOptimalNode {
4221                off,
4222                mlen: (ml + k) as u32,
4223                litlen: 0,
4224                reps,
4225            };
4226            if next > new_last {
4227                new_last = next;
4228            }
4229        }
4230        ml += W;
4231    }
4232    while ml <= max {
4233        let next_cost = priceset_next_cost(
4234            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4235        );
4236        let next = pos + ml;
4237        if next_cost < node_prices[next] {
4238            node_prices[next] = next_cost;
4239            nodes[next] = HcOptimalNode {
4240                off,
4241                mlen: ml as u32,
4242                litlen: 0,
4243                reps,
4244            };
4245            if next > new_last {
4246                new_last = next;
4247            }
4248        }
4249        ml += 1;
4250    }
4251    new_last
4252}
4253
/// AVX2 8-lane vector-load + deinterleave of cached ml-prices for the optimal
/// parser's price-set, reading a run of 8 contiguous `[price, generation]`
/// cells. Yields `Some(prices)` (in cell order) only when ALL eight cells are
/// warm (`generation == stamp`) — the common (~91-98%) case — so the caller
/// can fold them with one broadcast constant; a single cold cell yields `None`
/// so the chunk is routed through the scalar fill (which recomputes +
/// repopulates the misses). The deinterleave uses cheap in-128-lane ops
/// (`shuffle_epi32` + `unpack*_epi64`) and one cross-lane `permute4x64` for
/// the ordered prices, avoiding the latency-bound chain of cross-lane
/// `permutevar8x32`s that lost to pipelined scalar loads on high-chunk-count
/// fixtures.
///
/// # Safety
///
/// The CPU must support AVX2, and `cells` must hold at least 8 entries: the
/// two unaligned 256-bit loads read 64 bytes from its start, and the length is
/// only checked by a `debug_assert!`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_cached_prices8_avx2(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 8]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86 as simd;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as simd;

    debug_assert!(cells.len() >= 8);
    let src = cells.as_ptr().cast::<simd::__m256i>();
    // SAFETY: `cells` covers >= 8 pairs = 64 bytes; `loadu` has no alignment
    // requirement.
    // first = [p0 g0 p1 g1 | p2 g2 p3 g3], second = [p4 g4 p5 g5 | p6 g6 p7 g7].
    let (first, second) = unsafe {
        (
            simd::_mm256_loadu_si256(src),
            simd::_mm256_loadu_si256(src.add(1)),
        )
    };
    // Control 0xD8 picks elements (0, 2, 1, 3) inside every 128-bit lane:
    // [p g p g] -> [p p g g].
    let first_grouped = simd::_mm256_shuffle_epi32::<0xD8>(first); // [p0 p1 g0 g1 | p2 p3 g2 g3]
    let second_grouped = simd::_mm256_shuffle_epi32::<0xD8>(second); // [p4 p5 g4 g5 | p6 p7 g6 g7]

    // High 64 bits of each 128-bit lane hold the generations; their order is
    // irrelevant because only "all equal to stamp" is tested.
    let generations = simd::_mm256_unpackhi_epi64(first_grouped, second_grouped);
    let stamp_hits = simd::_mm256_cmpeq_epi32(generations, simd::_mm256_set1_epi32(stamp as i32));
    let warm_lanes = simd::_mm256_movemask_ps(simd::_mm256_castsi256_ps(stamp_hits)) as u8;
    if warm_lanes != 0xFF {
        return None;
    }

    // Low 64 bits of each 128-bit lane hold the prices, as 64-bit chunks
    // [p0p1 p4p5 p2p3 p6p7]; 0xD8 on the 64-bit permute swaps the middle two
    // chunks, giving in-order [p0..p7].
    let scrambled = simd::_mm256_unpacklo_epi64(first_grouped, second_grouped);
    let ordered = simd::_mm256_permute4x64_epi64::<0xD8>(scrambled);
    let mut out = [0u32; 8];
    // SAFETY: `out` is exactly 32 bytes; `storeu` has no alignment requirement.
    unsafe { simd::_mm256_storeu_si256(out.as_mut_ptr().cast::<simd::__m256i>(), ordered) };
    Some(out)
}
4303
4304#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4305#[target_feature(enable = "avx2")]
4306#[inline]
4307#[allow(clippy::too_many_arguments)]
4308unsafe fn priceset_range_nonabort_avx2(
4309    node_prices: &mut [u32],
4310    nodes: &mut [HcOptimalNode],
4311    ml_cache: &mut [[u32; 2]],
4312    ml_stamp: u32,
4313    profile: HcOptimalCostProfile,
4314    stats: &HcOptState,
4315    pos: usize,
4316    start: usize,
4317    max: usize,
4318    ll0_price: u32,
4319    off_price: u32,
4320    base_cost: u32,
4321    off: u32,
4322    reps: [u32; 3],
4323    last_pos: usize,
4324) -> usize {
4325    priceset_range_vec::<8>(
4326        node_prices,
4327        nodes,
4328        ml_cache,
4329        ml_stamp,
4330        profile,
4331        stats,
4332        pos,
4333        start,
4334        max,
4335        ll0_price,
4336        off_price,
4337        base_cost,
4338        off,
4339        reps,
4340        last_pos,
4341        // SAFETY: both closures run inside this fn's avx2 target_feature umbrella.
4342        |cells, stamp| unsafe { priceset_cached_prices8_avx2(cells, stamp) },
4343        |nc, np| unsafe { priceset_improved_mask8_avx2(nc, np) },
4344    )
4345}
4346
/// NEON 4-lane vector-load + deinterleave of cached ml-prices. `vld2q_u32`
/// splits the 4 contiguous `[price, generation]` pairs natively into two
/// registers (prices, generations) — no shuffle chain needed. The prices are
/// returned as `Some` only when all 4 generations equal `stamp`, i.e. when the
/// horizontal minimum (`vminvq`) of the equality mask is still all-ones;
/// otherwise the result is `None`.
///
/// # Safety
///
/// Requires NEON, and `cells` must hold at least 4 entries: the load reads 8
/// contiguous `u32` from its start, and the length is only checked by a
/// `debug_assert!`.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_cached_prices4_neon(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::aarch64 as neon;

    debug_assert!(cells.len() >= 4);
    // SAFETY: caller's neon umbrella; `cells` is >= 4 pairs = 8 contiguous u32.
    let split = unsafe { neon::vld2q_u32(cells.as_ptr().cast::<u32>()) };
    let (prices, generations) = (split.0, split.1);
    let stamp_hits = neon::vceqq_u32(generations, neon::vdupq_n_u32(stamp));
    if neon::vminvq_u32(stamp_hits) == u32::MAX {
        let mut out = [0u32; 4];
        // SAFETY: `out` has room for exactly the 4 lanes being stored.
        unsafe { neon::vst1q_u32(out.as_mut_ptr(), prices) };
        Some(out)
    } else {
        None
    }
}
4367
/// NEON 4-lane `next_cost < node_price` bitmask: bit `k` of the result is set
/// when `next_cost[k] < node_price[k]` (unsigned, strict). NEON has an
/// unsigned compare (`vcltq_u32`) but no movemask; AND the all-ones lane mask
/// with lane weights `[1,2,4,8]` and horizontal-add (`vaddvq_u32`) to pack the
/// 4 bits.
///
/// # Safety
///
/// Requires NEON, and `node_price` must hold at least 4 entries: 4 `u32` are
/// loaded from its start, and the length is only checked by a
/// `debug_assert!`.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_improved_mask4_neon(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::aarch64::{vaddvq_u32, vandq_u32, vcltq_u32, vld1q_u32};
    // Same debug-only length guard the deinterleave helpers carry; the load
    // below is otherwise unchecked.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: neon umbrella; both spans are 4 u32 wide.
    let nc = unsafe { vld1q_u32(next_cost.as_ptr()) };
    let np = unsafe { vld1q_u32(node_price.as_ptr()) };
    // Improving lanes become all-ones, the rest zero.
    let lt = vcltq_u32(nc, np);
    let weights: [u32; 4] = [1, 2, 4, 8];
    // SAFETY: `weights` is a live 4-element array.
    let w = unsafe { vld1q_u32(weights.as_ptr()) };
    // Each improving lane keeps only its own weight; the sum is the bitmask.
    let bits = vandq_u32(lt, w);
    vaddvq_u32(bits) as u8
}
4386
/// NEON tier of the vectorised price-set: forwards every argument unchanged
/// to `priceset_range_vec` with `W = 4`, plugging in the NEON deinterleave
/// (`priceset_cached_prices4_neon`) and improve-mask
/// (`priceset_improved_mask4_neon`) helpers, and returns its result.
///
/// # Safety
///
/// Requires NEON; this function and the helpers it inlines are compiled with
/// `#[target_feature(enable = "neon")]`.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_neon(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // SAFETY (both closures): they are defined and invoked inside this fn's
    // neon target_feature umbrella.
    let load_warm_prices =
        |cells: &[[u32; 2]], stamp: u32| unsafe { priceset_cached_prices4_neon(cells, stamp) };
    let improved_lanes =
        |nc: &[u32; 4], np: &[u32]| unsafe { priceset_improved_mask4_neon(nc, np) };
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        load_warm_prices,
        improved_lanes,
    )
}
4429
/// SSE4.1-tier 4-lane vector-load + deinterleave of cached ml-prices. Two
/// 128-bit loads fetch the `[price, gen]` pairs, `shuffle_epi32(0xD8)` groups
/// prices then gens inside each register, and `unpacklo/hi_epi64` pulls the
/// two groups apart. The prices are returned as `Some` only when all 4
/// generations equal `stamp`; otherwise the result is `None`.
///
/// # Safety
///
/// The CPU must support the enabled target feature (`sse4.2`), and `cells`
/// must hold at least 4 entries: the two unaligned loads read 32 bytes from
/// its start, and the length is only checked by a `debug_assert!`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_cached_prices4_sse41(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86 as simd;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as simd;

    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr().cast::<simd::__m128i>();
    // SAFETY: `cells` covers >= 4 pairs = 32 bytes; `loadu` has no alignment
    // requirement.
    // front = [p0 g0 p1 g1], back = [p2 g2 p3 g3].
    let (front, back) = unsafe {
        (
            simd::_mm_loadu_si128(src),
            simd::_mm_loadu_si128(src.add(1)),
        )
    };
    // Control 0xD8 picks elements (0, 2, 1, 3): [p g p g] -> [p p g g].
    let front_grouped = simd::_mm_shuffle_epi32::<0xD8>(front); // [p0 p1 g0 g1]
    let back_grouped = simd::_mm_shuffle_epi32::<0xD8>(back); // [p2 p3 g2 g3]

    let generations = simd::_mm_unpackhi_epi64(front_grouped, back_grouped); // [g0 g1 g2 g3]
    let stamp_hits = simd::_mm_cmpeq_epi32(generations, simd::_mm_set1_epi32(stamp as i32));
    let warm_lanes = simd::_mm_movemask_ps(simd::_mm_castsi128_ps(stamp_hits)) as u8 & 0x0F;
    if warm_lanes != 0x0F {
        return None;
    }

    let ordered = simd::_mm_unpacklo_epi64(front_grouped, back_grouped); // [p0 p1 p2 p3]
    let mut out = [0u32; 4];
    // SAFETY: `out` is exactly 16 bytes; `storeu` has no alignment requirement.
    unsafe { simd::_mm_storeu_si128(out.as_mut_ptr().cast::<simd::__m128i>(), ordered) };
    Some(out)
}
4466
/// SSE4.1 4-lane `next_cost < node_price` bitmask (unsigned compare via
/// `min_epu32`, like the AVX2 path): bit `k` of the result is set when
/// `next_cost[k] < node_price[k]`.
///
/// # Safety
///
/// The CPU must support the enabled target feature (`sse4.2`), and
/// `node_price` must hold at least 4 entries: 16 bytes are loaded from its
/// start, and the length is only checked by a `debug_assert!`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_improved_mask4_sse41(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    // Same debug-only length guard the deinterleave helpers carry; the load
    // below is otherwise unchecked.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: both spans are 4 u32 = 16 bytes wide; `loadu` has no alignment
    // requirement.
    let nc = unsafe { _mm_loadu_si128(next_cost.as_ptr() as *const __m128i) };
    let np = unsafe { _mm_loadu_si128(node_price.as_ptr() as *const __m128i) };
    // min(nc, np) == nc  <=>  nc <= np (unsigned).
    let min = _mm_min_epu32(nc, np);
    let le = _mm_cmpeq_epi32(min, nc);
    // Drop the equal lanes to turn `<=` into the strict `<`.
    let eq = _mm_cmpeq_epi32(nc, np);
    let lt = _mm_andnot_si128(eq, le);
    (_mm_movemask_ps(_mm_castsi128_ps(lt)) as u8) & 0x0F
}
4491
4492#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4493#[target_feature(enable = "sse4.2")]
4494#[inline]
4495#[allow(clippy::too_many_arguments)]
4496unsafe fn priceset_range_nonabort_sse41(
4497    node_prices: &mut [u32],
4498    nodes: &mut [HcOptimalNode],
4499    ml_cache: &mut [[u32; 2]],
4500    ml_stamp: u32,
4501    profile: HcOptimalCostProfile,
4502    stats: &HcOptState,
4503    pos: usize,
4504    start: usize,
4505    max: usize,
4506    ll0_price: u32,
4507    off_price: u32,
4508    base_cost: u32,
4509    off: u32,
4510    reps: [u32; 3],
4511    last_pos: usize,
4512) -> usize {
4513    priceset_range_vec::<4>(
4514        node_prices,
4515        nodes,
4516        ml_cache,
4517        ml_stamp,
4518        profile,
4519        stats,
4520        pos,
4521        start,
4522        max,
4523        ll0_price,
4524        off_price,
4525        base_cost,
4526        off,
4527        reps,
4528        last_pos,
4529        // SAFETY: both closures run inside this fn's sse4.2 target_feature umbrella.
4530        |cells, stamp| unsafe { priceset_cached_prices4_sse41(cells, stamp) },
4531        |nc, np| unsafe { priceset_improved_mask4_sse41(nc, np) },
4532    )
4533}
4534
/// wasm `simd128` 4-lane vector-load + deinterleave of cached ml-prices.
/// `u32x4_shuffle` picks the price (even) and gen (odd) lanes straight out of
/// the two loaded vectors. The prices are returned as `Some` only when all 4
/// gens equal `stamp` (`u32x4_all_true` of the equality vector); otherwise the
/// result is `None`.
///
/// # Safety
///
/// Requires `simd128`, and `cells` must hold at least 4 entries: the two
/// loads read 32 bytes from its start, and the length is only checked by a
/// `debug_assert!`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_cached_prices4_simd128(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::wasm32 as wasm;

    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr().cast::<wasm::v128>();
    // SAFETY: `cells` covers >= 4 pairs = 32 bytes, i.e. two full vectors.
    // front = [p0 g0 p1 g1], back = [p2 g2 p3 g3].
    let (front, back) = unsafe { (wasm::v128_load(src), wasm::v128_load(src.add(1))) };
    // Shuffle indices 0..3 address `front`, 4..7 address `back`.
    let generations = wasm::u32x4_shuffle::<1, 3, 5, 7>(front, back); // [g0 g1 g2 g3]
    let stamp_hits = wasm::u32x4_eq(generations, wasm::u32x4_splat(stamp));
    if wasm::u32x4_all_true(stamp_hits) {
        let ordered = wasm::u32x4_shuffle::<0, 2, 4, 6>(front, back); // [p0 p1 p2 p3]
        let mut out = [0u32; 4];
        // SAFETY: `out` is exactly one vector (16 bytes) wide.
        unsafe { wasm::v128_store(out.as_mut_ptr().cast::<wasm::v128>(), ordered) };
        Some(out)
    } else {
        None
    }
}
4561
/// wasm `simd128` 4-lane `next_cost < node_price` bitmask: bit `k` of the
/// result is set when `next_cost[k] < node_price[k]`. wasm has a native
/// unsigned compare (`u32x4_lt`) and `u32x4_bitmask` to pack the lanes.
///
/// # Safety
///
/// Requires `simd128`, and `node_price` must hold at least 4 entries: 16
/// bytes are loaded from its start, and the length is only checked by a
/// `debug_assert!`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_improved_mask4_simd128(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::wasm32::{u32x4_bitmask, u32x4_lt, v128, v128_load};
    // Same debug-only length guard the deinterleave helpers carry; the load
    // below is otherwise unchecked.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: both spans are 4 u32 = one full vector wide.
    let nc = unsafe { v128_load(next_cost.as_ptr() as *const v128) };
    let np = unsafe { v128_load(node_price.as_ptr() as *const v128) };
    u32x4_bitmask(u32x4_lt(nc, np))
}
4573
/// wasm `simd128` tier of the vectorised price-set: forwards every argument
/// unchanged to `priceset_range_vec` with `W = 4`, plugging in the simd128
/// deinterleave (`priceset_cached_prices4_simd128`) and improve-mask
/// (`priceset_improved_mask4_simd128`) helpers, and returns its result.
///
/// # Safety
///
/// Requires `simd128`; this function and the helpers it inlines are compiled
/// with `#[target_feature(enable = "simd128")]`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_simd128(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // SAFETY (both closures): they are defined and invoked inside this fn's
    // simd128 target_feature umbrella.
    let load_warm_prices =
        |cells: &[[u32; 2]], stamp: u32| unsafe { priceset_cached_prices4_simd128(cells, stamp) };
    let improved_lanes =
        |nc: &[u32; 4], np: &[u32]| unsafe { priceset_improved_mask4_simd128(nc, np) };
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        load_warm_prices,
        improved_lanes,
    )
}
4616
4617macro_rules! build_optimal_plan_impl_body {
4618    (
4619        $self:expr,
4620        $strategy_ty:ty,
4621        $current:ident,
4622        $current_abs_start:ident,
4623        $current_len:ident,
4624        $initial_state:ident,
4625        $stats:ident,
4626        $out:ident,
4627        $collect:ident,
4628        $priceset:path $(,)?
4629    ) => {{
4630        let current_abs_end = $current_abs_start + $current_len;
4631        let min_match_len = HC_OPT_MIN_MATCH_LEN;
4632        // `HC_OPT_NUM > 0` by const definition, so `HC_OPT_NUM - 1` is safe.
4633        let frontier_limit = $current_len.min(HC_OPT_NUM - 1);
4634        let initial_reps = $initial_state.reps;
4635        let initial_litlen = $initial_state.litlen;
4636        let ldm_block_offset = $initial_state.block_offset;
4637        let mut profile = $initial_state.profile;
4638        profile.sufficient_match_len = $self.hc.sufficient_match_len_for_pass(profile);
4639        // Const-fold from the strategy's associated `OPT_LEVEL`
4640        // (upstream zstd `optLevel`): BtOpt = 0, BtUltra / BtUltra2 = 2.
4641        // The two flags below are the only places the inner DP loop
4642        // used to consult `parse_mode`; lifting them into const
4643        // expressions drops one indirect read + one branch on every
4644        // candidate insertion and every traceback step.
4645        // `let` (not `const`) — nested `const` items inside a
4646        // generic fn cannot project through the outer fn's type
4647        // parameter, but a `let` binding from a const expression
4648        // does get folded by the optimiser per monomorphisation,
4649        // which is what we actually want here.
4650        debug_assert!(
4651            <$strategy_ty as super::strategy::Strategy>::USE_BT,
4652            "build_optimal_plan_impl_body called on non-BT strategy"
4653        );
4654        let abort_on_worse_match: bool =
4655            <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL == 0;
4656        let opt_level: bool = <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL >= 2;
4657        let mut nodes = core::mem::take(&mut $self.backend.bt_mut().opt_nodes_scratch);
4658        let mut node_prices = core::mem::take(&mut $self.backend.bt_mut().opt_node_prices_scratch);
4659        // `frontier_limit + 2 <= HC_OPT_NODE_LEN` — bounded by const.
4660        let frontier_buffer_size = frontier_limit + 2;
4661        if nodes.len() < HC_OPT_NODE_LEN {
4662            // First optimal-parse use (empty boxed slice) or an undersized
4663            // buffer: allocate the fixed upstream-zstd-sized frontier once. The DP
4664            // overwrites the active prefix before reading it.
4665            nodes = alloc::vec![HcOptimalNode::default(); HC_OPT_NODE_LEN].into_boxed_slice();
4666        }
4667        // The DP price array, same fixed length as `nodes`. This is the SOLE
4668        // home of each position's price (the node struct carries no price), so
4669        // the SIMD price-set vector-loads it directly. Initialised to u32::MAX
4670        // so unwritten frontier cells compare as "unreachable".
4671        if node_prices.len() < HC_OPT_NODE_LEN {
4672            node_prices = alloc::vec![u32::MAX; HC_OPT_NODE_LEN].into_boxed_slice();
4673        }
4674        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);
4675        candidates.clear();
4676        if candidates.capacity() < MAX_HC_SEARCH_DEPTH {
4677            candidates.reserve_exact(MAX_HC_SEARCH_DEPTH - candidates.capacity());
4678        }
4679        let mut store = core::mem::take(&mut $self.backend.bt_mut().opt_store_scratch);
4680        store.clear();
4681        let mut price_arena = core::mem::take(&mut $self.backend.bt_mut().opt_price_arena);
4682        if price_arena.len() < HC_OPT_PRICE_ARENA_LEN {
4683            price_arena = alloc::vec![[0u32; 2]; HC_OPT_PRICE_ARENA_LEN].into_boxed_slice();
4684        }
4685        // Single arena → two disjoint fixed-stride regions of `[price,
4686        // generation]` pairs (LL cache, ML cache): one base pointer + fixed
4687        // offsets, mirroring upstream zstd's single opt workspace. Pairing
4688        // price+generation per code keeps the optimal parser's cache probe
4689        // on ONE line instead of two strided regions.
4690        // SAFETY: `price_arena` is exactly `HC_OPT_PRICE_ARENA_LEN =
4691        // 2 * HC_OPT_PRICE_STRIDE` pairs long (just ensured), so the two
4692        // STRIDE-wide regions are in bounds and disjoint. The slices alias
4693        // the heap buffer `price_arena` owns; that heap address is stable
4694        // across the later move of the `price_arena` box into the result
4695        // bundle (a `Box` move relocates only the pointer, not the heap
4696        // data), and the slices are never used after the bundle is
4697        // constructed. The fixed STRIDE (independent of `frontier_limit`)
4698        // keeps every code's cell at a constant offset so the monotonic
4699        // stamps stay valid across calls with different frontiers.
4700        let arena_base = price_arena.as_mut_ptr();
4701        let mut ll_cache: &mut [[u32; 2]] =
4702            unsafe { core::slice::from_raw_parts_mut(arena_base, HC_OPT_PRICE_STRIDE) };
4703        let mut ml_cache: &mut [[u32; 2]] = unsafe {
4704            core::slice::from_raw_parts_mut(arena_base.add(HC_OPT_PRICE_STRIDE), HC_OPT_PRICE_STRIDE)
4705        };
4706        $self.backend.bt_mut().opt_ll_price_stamp = $self
4707            .backend
4708            .bt_mut()
4709            .opt_ll_price_stamp
4710            .wrapping_add(1)
4711            .max(1);
4712        let ll_price_stamp = $self.backend.bt_mut().opt_ll_price_stamp;
4713        $self.backend.bt_mut().opt_lit_price_stamp = $self
4714            .backend
4715            .bt_mut()
4716            .opt_lit_price_stamp
4717            .wrapping_add(1)
4718            .max(1);
4719        let lit_price_stamp = $self.backend.bt_mut().opt_lit_price_stamp;
4720        $self.backend.bt_mut().opt_ml_price_stamp = $self
4721            .backend
4722            .bt_mut()
4723            .opt_ml_price_stamp
4724            .wrapping_add(1)
4725            .max(1);
4726        let ml_price_stamp = $self.backend.bt_mut().opt_ml_price_stamp;
4727        let node0_price = BtMatcher::cached_lit_length_price(
4728            profile,
4729            $stats,
4730            initial_litlen,
4731            &mut ll_cache,
4732            ll_price_stamp,
4733        );
4734        nodes[0] = HcOptimalNode {
4735            litlen: initial_litlen as u32,
4736            reps: initial_reps,
4737            ..HcOptimalNode::default()
4738        };
4739        node_prices[0] = node0_price;
4740        let sufficient_len = profile.sufficient_match_len;
4741        let ll0_price = BtMatcher::cached_lit_length_price(
4742            profile,
4743            $stats,
4744            0,
4745            &mut ll_cache,
4746            ll_price_stamp,
4747        );
4748        let ll1_price = BtMatcher::cached_lit_length_price(
4749            profile,
4750            $stats,
4751            1,
4752            &mut ll_cache,
4753            ll_price_stamp,
4754        );
4755        let mut pos = 1usize;
4756        let mut last_pos = 0usize;
4757        let mut forced_end: Option<usize> = None;
4758        let mut forced_end_state: Option<HcOptimalNode> = None;
4759        // Price companion of `forced_end_state` (price no longer lives in the
4760        // node struct; tracked alongside the forced-seed node).
4761        let mut forced_end_price: Option<u32> = None;
4762        let mut seed_forced_shortest_path = false;
4763        let mut opt_ldm = HcOptLdmState {
4764            seq_store: HcRawSeqStore {
4765                pos: 0,
4766                pos_in_sequence: 0,
4767                size: $self.backend.bt_mut().ldm_sequences.len(),
4768            },
4769            ..HcOptLdmState::default()
4770        };
4771        let has_ldm = !$self.backend.bt_mut().ldm_sequences.is_empty();
4772        if has_ldm {
4773            // `ldm_sequences` are emitted in BLOCK-relative coordinates,
4774            // but this optimal-parser pass runs over a SEGMENT of the
4775            // block starting at block-offset `$block_offset` and uses
4776            // segment-relative positions throughout. Fast-forward the raw
4777            // seq-store cursor past the bytes covered by earlier segments
4778            // so the (segment-relative) LDM windows below land at the
4779            // correct positions. Idempotent: `ldm_skip_raw_seq_store_bytes`
4780            // recomputes from `pos = 0`, so re-running it per segment is
4781            // safe. Without this, every segment after the first re-applied
4782            // the block's leading LDM windows at the wrong offset, emitting
4783            // matches that copy the wrong bytes (undecodable frame).
4784            if ldm_block_offset > 0 {
4785                $self
4786                    .backend
4787                    .bt_mut()
4788                    .ldm_skip_raw_seq_store_bytes(&mut opt_ldm.seq_store, ldm_block_offset);
4789            }
4790            $self
4791                .backend
4792                .bt_mut()
4793                .ldm_get_next_match_and_update_seq_store(&mut opt_ldm, 0, $current_len);
4794        }
4795
4796        // Upstream zstd-like seed at rPos=0: initialize frontier with matches starting
4797        // at current position before entering the generic forward DP loop.
4798        if $current_len >= min_match_len {
4799            let seed_ldm = if has_ldm {
4800                $self.backend.bt_mut().ldm_process_match_candidate(
4801                    &mut opt_ldm,
4802                    0,
4803                    $current_len,
4804                    min_match_len,
4805                )
4806            } else {
4807                None
4808            };
4809            candidates.clear();
4810            // SAFETY: wrapper is in the same target_feature umbrella as the
4811            // `$collect` kernel variant; the runtime kernel detector already
4812            // gated entry into the wrapper.
4813            unsafe {
4814                $self.$collect::<$strategy_ty, true>(
4815                    $current_abs_start,
4816                    current_abs_end,
4817                    profile,
4818                    HcCandidateQuery {
4819                        reps: initial_reps,
4820                        lit_len: initial_litlen,
4821                        ldm_candidate: seed_ldm,
4822                    },
4823                    &mut candidates,
4824                )
4825            };
4826            if !candidates.is_empty() {
4827                // `min_match_len >= HC_FORMAT_MINMATCH (3)` by invariant.
4828                last_pos = (min_match_len - 1).min(frontier_limit);
4829                for p in 1..min_match_len.min(frontier_buffer_size) {
4830                    BtMatcher::reset_opt_node(&mut nodes[p]);
4831                    // Reset the price (sole home; the node carries none).
4832                    node_prices[p] = u32::MAX;
4833                    // `initial_litlen` is the litlen carried from prior
4834                    // optimal-plan segments — its real bound is the
4835                    // current block length (the frame compressor caps
4836                    // block scan at `HC_BLOCKSIZE_MAX`), not the segment
4837                    // `current_len`. `p < min_match_len` (small constant),
4838                    // so the sum stays well within `u32::MAX`. Use
4839                    // `checked_add` FIRST so the `usize` addition itself
4840                    // cannot overflow on i686 (where `usize` is 32-bit
4841                    // and a wrapping `+` would slip past `try_from`).
4842                    let seed_litlen = initial_litlen
4843                        .checked_add(p)
4844                        .and_then(|s| u32::try_from(s).ok())
4845                        .expect("optimal parser seed litlen out of u32 range");
4846                    nodes[p].litlen = seed_litlen;
4847                }
4848            }
4849
4850            if let Some(candidate) = candidates.last() {
4851                let longest_len = candidate.match_len.min($current_len);
4852                if longest_len > sufficient_len {
4853                    let off_base = BtMatcher::encode_offset_base_with_reps(
4854                        candidate.offset as u32,
4855                        initial_litlen,
4856                        initial_reps,
4857                    );
4858                    let off_price = profile
4859                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
4860                    let ml_price = BtMatcher::cached_match_length_price(
4861                        profile,
4862                        $stats,
4863                        longest_len,
4864                        &mut ml_cache,
4865                        ml_price_stamp,
4866                    );
4867                    let seq_cost = BtMatcher::add_prices(
4868                        ll0_price,
4869                        profile.match_price_from_parts(off_price, ml_price, $stats),
4870                    );
4871                    let forced_price = BtMatcher::add_prices(node_prices[0], seq_cost);
4872                    let forced_state = HcOptimalNode {
4873                        off: candidate.offset as u32,
4874                        mlen: longest_len as u32,
4875                        litlen: 0,
4876                        reps: initial_reps,
4877                    };
4878                    if longest_len < frontier_buffer_size && forced_price < node_prices[longest_len] {
4879                        nodes[longest_len] = forced_state;
4880                        node_prices[longest_len] = forced_price;
4881                    }
4882                    forced_end = Some(longest_len);
4883                    forced_end_state = Some(forced_state);
4884                    forced_end_price = Some(forced_price);
4885                    seed_forced_shortest_path = true;
4886                }
4887            }
4888            if !seed_forced_shortest_path {
4889                let mut prev_max_len = min_match_len - 1;
4890                for candidate in candidates.iter() {
4891                    let max_match_len = candidate.match_len.min(frontier_limit);
4892                    if max_match_len < min_match_len {
4893                        continue;
4894                    }
4895                    let start_len = (prev_max_len + 1).max(min_match_len);
4896                    if start_len > max_match_len {
4897                        prev_max_len = prev_max_len.max(max_match_len);
4898                        continue;
4899                    }
4900                    if max_match_len > last_pos {
4901                        BtMatcher::reset_opt_nodes(
4902                            &mut nodes,
4903                            &mut node_prices,
4904                            last_pos + 1,
4905                            max_match_len,
4906                        );
4907                    }
4908                    let off_base = BtMatcher::encode_offset_base_with_reps(
4909                        candidate.offset as u32,
4910                        initial_litlen,
4911                        initial_reps,
4912                    );
4913                    let off_price = profile
4914                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
4915                    debug_assert!(max_match_len < frontier_buffer_size);
4916                    let nodes0_price = node_prices[0];
4917                    for match_len in (start_len..=max_match_len).rev() {
4918                        let ml_price = BtMatcher::cached_match_length_price(
4919                            profile,
4920                            $stats,
4921                            match_len,
4922                            &mut ml_cache,
4923                            ml_price_stamp,
4924                        );
4925                        let seq_cost = BtMatcher::add_prices(
4926                            ll0_price,
4927                            profile.match_price_from_parts(off_price, ml_price, $stats),
4928                        );
4929                        let next_cost = BtMatcher::add_prices(nodes0_price, seq_cost);
4930                        let node_price = unsafe { *node_prices.get_unchecked(match_len) };
4931                        if match_len > last_pos || next_cost < node_price {
4932                            let slot = unsafe { nodes.get_unchecked_mut(match_len) };
4933                            *slot = HcOptimalNode {
4934                                off: candidate.offset as u32,
4935                                mlen: match_len as u32,
4936                                litlen: 0,
4937                                reps: initial_reps,
4938                            };
4939                            unsafe { *node_prices.get_unchecked_mut(match_len) = next_cost };
4940                            if match_len > last_pos {
4941                                last_pos = match_len;
4942                            }
4943                        } else if abort_on_worse_match {
4944                            break;
4945                        }
4946                    }
4947                    prev_max_len = prev_max_len.max(max_match_len);
4948                }
4949                if last_pos + 1 < frontier_buffer_size {
4950                    node_prices[last_pos + 1] = u32::MAX;
4951                }
4952            }
4953        }
4954        while !seed_forced_shortest_path && pos <= last_pos && pos <= frontier_limit {
4955            debug_assert!(pos + 1 < frontier_buffer_size);
4956            let prev_node = unsafe { *nodes.get_unchecked(pos - 1) };
4957            let prev_node_price = unsafe { *node_prices.get_unchecked(pos - 1) };
4958            if prev_node_price != u32::MAX {
4959                let lit_len = prev_node.litlen as usize + 1;
4960                let lit_price = {
4961                    let bt = $self.backend.bt_mut();
4962                    BtMatcher::cached_literal_price(
4963                        profile,
4964                        $stats,
4965                        $current[pos - 1],
4966                        &mut bt.opt_lit_price_scratch,
4967                        &mut bt.opt_lit_price_generation,
4968                        lit_price_stamp,
4969                    )
4970                };
4971                let ll_delta = BtMatcher::cached_lit_length_delta_price(
4972                    profile,
4973                    $stats,
4974                    lit_len,
4975                    &mut ll_cache,
4976                    ll_price_stamp,
4977                );
4978                let lit_cost = BtMatcher::add_price_delta(prev_node_price, lit_price, ll_delta);
4979                // `node_pos_price` is the OLD price at `pos` (before the write
4980                // below) — also the price of `prev_match`, the pre-overwrite copy.
4981                let node_pos_price = unsafe { *node_prices.get_unchecked(pos) };
4982                if lit_cost <= node_pos_price {
4983                    let prev_match = unsafe { *nodes.get_unchecked(pos) };
4984                    let slot = unsafe { nodes.get_unchecked_mut(pos) };
4985                    *slot = prev_node;
4986                    slot.litlen = lit_len as u32;
4987                    node_prices[pos] = lit_cost;
4988                    #[allow(clippy::collapsible_if)]
4989                    if opt_level
4990                        && prev_match.mlen > 0
4991                        && prev_match.litlen == 0
4992                        && pos < $current_len
4993                    {
4994                        if ll1_price < ll0_price {
4995                            let next_lit_price = {
4996                                let bt = $self.backend.bt_mut();
4997                                BtMatcher::cached_literal_price(
4998                                    profile,
4999                                    $stats,
5000                                    $current[pos],
5001                                    &mut bt.opt_lit_price_scratch,
5002                                    &mut bt.opt_lit_price_generation,
5003                                    lit_price_stamp,
5004                                )
5005                            };
5006                            let with1literal = BtMatcher::add_price_delta(
5007                                node_pos_price,
5008                                next_lit_price,
5009                                ll1_price as i32 - ll0_price as i32,
5010                            );
5011                            let ll_delta_next = BtMatcher::cached_lit_length_delta_price(
5012                                profile,
5013                                $stats,
5014                                lit_len + 1,
5015                                &mut ll_cache,
5016                                ll_price_stamp,
5017                            );
5018                            let with_more_literals =
5019                                BtMatcher::add_price_delta(lit_cost, next_lit_price, ll_delta_next);
5020                            let next = pos + 1;
5021                            let next_price = unsafe { *node_prices.get_unchecked(next) };
5022                            if with1literal < with_more_literals && with1literal < next_price {
5023                                // Upstream zstd parity (zstd_opt.c:1232): `cur >= prevMatch.mlen`.
5024                                debug_assert!(pos >= prev_match.mlen as usize);
5025                                let prev_pos = pos - prev_match.mlen as usize;
5026                                {
5027                                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5028                                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5029                                        prev_match.off,
5030                                        prev_state.litlen as usize,
5031                                        prev_state.reps,
5032                                    );
5033                                    let slot = unsafe { nodes.get_unchecked_mut(next) };
5034                                    *slot = prev_match;
5035                                    slot.reps = reps_after_match;
5036                                    slot.litlen = 1;
5037                                    node_prices[next] = with1literal;
5038                                    if next > last_pos {
5039                                        last_pos = next;
5040                                    }
5041                                }
5042                            }
5043                        }
5044                    }
5045                }
5046            }
5047
5048            // Memory-resident DP (upstream zstd parity): read opt[cur] fields on
5049            // demand instead of holding a 28-byte node copy live across the
5050            // per-position `$collect` call below. The held copy forced LLVM
5051            // to spill reps[3] + litlen around the (non-inlinable) call;
5052            // reading the fields fresh on each side keeps them out of the
5053            // cross-call live set. `nodes[pos]` is stable across `$collect`
5054            // (it only fills `candidates`), so post-call reads are identical.
5055            let base_cost = unsafe { *node_prices.get_unchecked(pos) };
5056            if base_cost == u32::MAX {
5057                pos += 1;
5058                continue;
5059            }
5060            {
5061                let base_node = unsafe { *nodes.get_unchecked(pos) };
5062                if base_node.mlen > 0 && base_node.litlen == 0 {
5063                    // Upstream zstd parity (zstd_opt.c:1255): `cur >= opt[cur].mlen`.
5064                    debug_assert!(pos >= base_node.mlen as usize);
5065                    let prev_pos = pos - base_node.mlen as usize;
5066                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5067                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5068                        base_node.off,
5069                        prev_state.litlen as usize,
5070                        prev_state.reps,
5071                    );
5072                    unsafe { nodes.get_unchecked_mut(pos).reps = reps_after_match };
5073                }
5074            }
5075
5076            if pos + 8 > $current_len {
5077                pos += 1;
5078                continue;
5079            }
5080
5081            if pos == last_pos {
5082                break;
5083            }
5084
5085            let next_price = unsafe { *node_prices.get_unchecked(pos + 1) };
5086            // `saturating_add` is REQUIRED here, not a masked bug: `base_cost`
5087            // is a node price that can be the `u32::MAX` "unreachable" sentinel,
5088            // and saturating keeps `base_cost + margin` pinned at MAX so the
5089            // comparison stays correct. Plain `+` would wrap the sentinel and
5090            // flip the abort decision (a ratio bug / debug overflow panic).
5091            if abort_on_worse_match
5092                && next_price <= base_cost.saturating_add(HC_BITCOST_MULTIPLIER / 2)
5093            {
5094                pos += 1;
5095                continue;
5096            }
5097
5098            let abs_pos = $current_abs_start + pos;
5099            let ldm_candidate = if has_ldm {
5100                $self.backend.bt_mut().ldm_process_match_candidate(
5101                    &mut opt_ldm,
5102                    pos,
5103                    $current_len - pos,
5104                    min_match_len,
5105                )
5106            } else {
5107                None
5108            };
5109            candidates.clear();
5110            // SAFETY: same umbrella as `$collect`. Query fields are read
5111            // fresh here (consumed into the call's argument) so they do not
5112            // stay live across the call; the post-call reads below are a
5113            // separate, fresh load of the same stable `nodes[pos]`.
5114            unsafe {
5115                $self.$collect::<$strategy_ty, true>(
5116                    abs_pos,
5117                    current_abs_end,
5118                    profile,
5119                    HcCandidateQuery {
5120                        reps: nodes.get_unchecked(pos).reps,
5121                        lit_len: nodes.get_unchecked(pos).litlen as usize,
5122                        ldm_candidate,
5123                    },
5124                    &mut candidates,
5125                )
5126            };
5127            // Post-call reads of opt[cur]: fresh, born after `$collect`, so
5128            // never part of the cross-call live set (see memory-resident note
5129            // above). `nodes[pos]` is untouched by `$collect`.
5130            let base_reps = unsafe { nodes.get_unchecked(pos).reps };
5131            let base_litlen = unsafe { nodes.get_unchecked(pos).litlen as usize };
5132            if let Some(candidate) = candidates.last() {
5133                let longest_len = candidate.match_len.min($current_len - pos);
5134                if longest_len > sufficient_len
5135                    || pos + longest_len >= HC_OPT_NUM
5136                    || pos + longest_len >= $current_len
5137                {
5138                    let lit_len = base_litlen;
5139                    let off_base = BtMatcher::encode_offset_base_with_reps(
5140                        candidate.offset as u32,
5141                        lit_len,
5142                        base_reps,
5143                    );
5144                    let off_price = profile
5145                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5146                    let ml_price = BtMatcher::cached_match_length_price(
5147                        profile,
5148                        $stats,
5149                        longest_len,
5150                        &mut ml_cache,
5151                        ml_price_stamp,
5152                    );
5153                    let seq_cost = BtMatcher::add_prices(
5154                        ll0_price,
5155                        profile.match_price_from_parts(off_price, ml_price, $stats),
5156                    );
5157                    let forced_price = BtMatcher::add_prices(base_cost, seq_cost);
5158                    let end_pos = (pos + longest_len).min($current_len);
5159                    forced_end = Some(end_pos);
5160                    forced_end_state = Some(HcOptimalNode {
5161                        off: candidate.offset as u32,
5162                        mlen: longest_len as u32,
5163                        litlen: 0,
5164                        reps: base_reps,
5165                    });
5166                    forced_end_price = Some(forced_price);
5167                    break;
5168                }
5169            }
5170            let mut prev_max_len = min_match_len - 1;
5171            for candidate in candidates.iter() {
5172                // Outer loop guards `pos <= frontier_limit` (see the
5173                // `while ... pos <= frontier_limit` condition); the
5174                // subtraction below is therefore safe.
5175                debug_assert!(pos <= frontier_limit);
5176                let max_match_len = candidate
5177                    .match_len
5178                    .min($current_len - pos)
5179                    .min(frontier_limit - pos);
5180                let min_len = min_match_len;
5181                if max_match_len < min_len {
5182                    continue;
5183                }
5184                let start_len = (prev_max_len + 1).max(min_len);
5185                if start_len > max_match_len {
5186                    prev_max_len = prev_max_len.max(max_match_len);
5187                    continue;
5188                }
5189                let max_next = pos + max_match_len;
5190                if max_next > last_pos {
5191                    BtMatcher::reset_opt_nodes(
5192                        &mut nodes,
5193                        &mut node_prices,
5194                        last_pos + 1,
5195                        max_next,
5196                    );
5197                }
5198                let lit_len = base_litlen;
5199                let off_base = BtMatcher::encode_offset_base_with_reps(
5200                    candidate.offset as u32,
5201                    lit_len,
5202                    base_reps,
5203                );
5204                let off_price = profile
5205                    .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5206                debug_assert!(pos + max_match_len < frontier_buffer_size);
5207                if abort_on_worse_match {
5208                    // btopt (OPT_LEVEL == 0): reverse-iterate with early break —
5209                    // once a longer match stops improving, shorter ones are
5210                    // skipped. Order-dependent, stays scalar.
5211                    for match_len in (start_len..=max_match_len).rev() {
5212                        let next = pos + match_len;
5213                        let ml_price = BtMatcher::cached_match_length_price(
5214                            profile,
5215                            $stats,
5216                            match_len,
5217                            &mut ml_cache,
5218                            ml_price_stamp,
5219                        );
5220                        let seq_cost = BtMatcher::add_prices(
5221                            ll0_price,
5222                            profile.match_price_from_parts(off_price, ml_price, $stats),
5223                        );
5224                        let next_cost = BtMatcher::add_prices(base_cost, seq_cost);
5225                        let node_next_price = unsafe { *node_prices.get_unchecked(next) };
5226                        if next > last_pos || next_cost < node_next_price {
5227                            let slot = unsafe { nodes.get_unchecked_mut(next) };
5228                            *slot = HcOptimalNode {
5229                                off: candidate.offset as u32,
5230                                mlen: match_len as u32,
5231                                litlen: 0,
5232                                reps: base_reps,
5233                            };
5234                            unsafe { *node_prices.get_unchecked_mut(next) = next_cost };
5235                            if next > last_pos {
5236                                last_pos = next;
5237                            }
5238                        } else {
5239                            break;
5240                        }
5241                    }
5242                } else {
5243                    // btultra / btultra2 (OPT_LEVEL >= 2): no abort, each
5244                    // match_len writes a distinct node => order-independent.
5245                    // Dispatch to the per-tier price-set ($priceset is the
5246                    // tier's fn: AVX2 SoA-vector compare for the avx2 wrapper,
5247                    // inline scalar otherwise) — it folds into this wrapper's
5248                    // monomorphisation, so no call ABI / runtime feature check.
5249                    #[allow(unused_unsafe)]
5250                    {
5251                        last_pos = last_pos.max(unsafe {
5252                            $priceset(
5253                                &mut node_prices,
5254                                &mut nodes,
5255                                ml_cache,
5256                                ml_price_stamp,
5257                                profile,
5258                                $stats,
5259                                pos,
5260                                start_len,
5261                                max_match_len,
5262                                ll0_price,
5263                                off_price,
5264                                base_cost,
5265                                candidate.offset as u32,
5266                                base_reps,
5267                                last_pos,
5268                            )
5269                        });
5270                    }
5271                }
5272                prev_max_len = prev_max_len.max(max_match_len);
5273            }
5274
5275            if last_pos + 1 < frontier_buffer_size {
5276                unsafe {
5277                    *node_prices.get_unchecked_mut(last_pos + 1) = u32::MAX;
5278                }
5279            }
5280            pos += 1;
5281        }
5282
5283        if last_pos == 0 {
5284            if $current_len == 0 {
5285                let price = node_prices[0];
5286                return $self.backend.bt_mut().finish_optimal_plan(
5287                    HcOptimalPlanBuffers {
5288                        nodes,
5289                        node_prices,
5290                        candidates,
5291                        store,
5292                        price_arena,
5293                    },
5294                    (price, initial_reps, initial_litlen, 0),
5295                );
5296            }
5297            let lit_price = {
5298                let bt = $self.backend.bt_mut();
5299                BtMatcher::cached_literal_price(
5300                    profile,
5301                    $stats,
5302                    $current[0],
5303                    &mut bt.opt_lit_price_scratch,
5304                    &mut bt.opt_lit_price_generation,
5305                    lit_price_stamp,
5306                )
5307            };
5308            // `initial_litlen` is carried across optimal-plan segments;
5309            // its real bound is the current block length, not
5310            // `current_len`. On i686 (32-bit `usize`) `+ 1` could
5311            // theoretically wrap if the invariant ever broke. Catch
5312            // that explicitly via `checked_add` rather than letting a
5313            // wrapping sum slip into the price lookup.
5314            let next_litlen = initial_litlen
5315                .checked_add(1)
5316                .expect("optimal parser next litlen out of usize range");
5317            let ll_delta = BtMatcher::cached_lit_length_delta_price(
5318                profile,
5319                $stats,
5320                next_litlen,
5321                &mut ll_cache,
5322                ll_price_stamp,
5323            );
5324            let price = BtMatcher::add_price_delta(node_prices[0], lit_price, ll_delta);
5325            return $self.backend.bt_mut().finish_optimal_plan(
5326                HcOptimalPlanBuffers {
5327                    nodes,
5328                    node_prices,
5329                    candidates,
5330                    store,
5331                    price_arena,
5332                },
5333                (price, initial_reps, next_litlen, 1),
5334            );
5335        }
5336
5337        let target_pos = forced_end.unwrap_or(last_pos.min(frontier_limit));
5338        // Price lives in `node_prices`, not the node struct, so carry the
5339        // final-stretch price alongside its node (forced-seed companion or the
5340        // frontier price at `target_pos`).
5341        let (last_stretch, last_stretch_price) = if let Some(forced_state) = forced_end_state {
5342            (forced_state, forced_end_price.expect("forced state has a price"))
5343        } else {
5344            (nodes[target_pos], node_prices[target_pos])
5345        };
5346        if last_stretch_price == u32::MAX {
5347            return $self.backend.bt_mut().finish_optimal_plan(
5348                HcOptimalPlanBuffers {
5349                    nodes,
5350                    node_prices,
5351                    candidates,
5352                    store,
5353                    price_arena,
5354                },
5355                (u32::MAX, initial_reps, initial_litlen, $current_len),
5356            );
5357        }
5358
5359        if last_stretch.mlen == 0 {
5360            return $self.backend.bt_mut().finish_optimal_plan(
5361                HcOptimalPlanBuffers {
5362                    nodes,
5363                    node_prices,
5364                    candidates,
5365                    store,
5366                    price_arena,
5367                },
5368                (
5369                    last_stretch_price,
5370                    last_stretch.reps,
5371                    last_stretch.litlen as usize,
5372                    target_pos.min($current_len),
5373                ),
5374            );
5375        }
5376
5377        let mut cur = target_pos.saturating_sub(last_stretch.mlen as usize);
5378        let end_reps = if last_stretch.litlen == 0 {
5379            let prev_state = nodes[cur];
5380            let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5381                last_stretch.off,
5382                prev_state.litlen as usize,
5383                prev_state.reps,
5384            );
5385            reps_after_match
5386        } else {
5387            let tail_literals = last_stretch.litlen as usize;
5388            if cur < tail_literals {
5389                return $self.backend.bt_mut().finish_optimal_plan(
5390                    HcOptimalPlanBuffers {
5391                        nodes,
5392                        node_prices,
5393                        candidates,
5394                        store,
5395                        price_arena,
5396                    },
5397                    (
5398                        last_stretch_price,
5399                        last_stretch.reps,
5400                        tail_literals,
5401                        target_pos.min($current_len),
5402                    ),
5403                );
5404            }
5405            cur -= tail_literals;
5406            last_stretch.reps
5407        };
5408        let store_end = cur + 2;
5409        if store.len() <= store_end {
5410            store.resize(store_end + 1, HcOptimalNode::default());
5411        }
5412        let mut store_start;
5413        let mut stretch_pos = cur;
5414
5415        if last_stretch.litlen > 0 {
5416            store[store_end] = HcOptimalNode {
5417                litlen: last_stretch.litlen,
5418                mlen: 0,
5419                ..HcOptimalNode::default()
5420            };
5421            store_start = store_end.saturating_sub(1);
5422            store[store_start] = last_stretch;
5423        }
5424        store[store_end] = last_stretch;
5425        store_start = store_end;
5426
5427        loop {
5428            let next_stretch = nodes[stretch_pos];
5429            store[store_start].litlen = next_stretch.litlen;
5430            if next_stretch.mlen == 0 {
5431                break;
5432            }
5433            if store_start == 0 {
5434                break;
5435            }
5436            store_start -= 1;
5437            store[store_start] = next_stretch;
5438            // Parser invariant: every emitted stretch is bounded by the
5439            // current block, so `litlen + mlen <= current_len <=
5440            // HC_BLOCKSIZE_MAX (128 KiB)`. The `as usize` widening + raw
5441            // `+` is safe on 32-bit targets — two u32 values do NOT
5442            // automatically fit in `usize` on i686, the block bound is
5443            // what makes this addition safe.
5444            let litlen = next_stretch.litlen as usize;
5445            let mlen = next_stretch.mlen as usize;
5446            debug_assert!(litlen + mlen <= $current_len);
5447            let step = litlen + mlen;
5448            if step == 0 || stretch_pos < step {
5449                break;
5450            }
5451            stretch_pos -= step;
5452        }
5453
5454        let mut tail_literals = initial_litlen;
5455        let mut store_pos = store_start;
5456        while store_pos <= store_end {
5457            let stretch = store[store_pos];
5458            let llen = stretch.litlen as usize;
5459            let mlen = stretch.mlen as usize;
5460            if mlen == 0 {
5461                tail_literals = llen;
5462                store_pos += 1;
5463                continue;
5464            }
5465            $out.push(HcOptimalSequence {
5466                offset: stretch.off,
5467                match_len: mlen as u32,
5468                lit_len: llen as u32,
5469            });
5470            tail_literals = 0;
5471            store_pos += 1;
5472        }
5473        let result = (
5474            last_stretch_price,
5475            end_reps,
5476            if last_stretch.litlen > 0 {
5477                last_stretch.litlen as usize
5478            } else {
5479                tail_literals
5480            },
5481            target_pos.min($current_len),
5482        );
5483        $self.backend.bt_mut().finish_optimal_plan(
5484            HcOptimalPlanBuffers {
5485                nodes,
5486                node_prices,
5487                candidates,
5488                store,
5489                price_arena,
5490            },
5491            result,
5492        )
5493    }};
5494}
5495
/// Body of `collect_optimal_candidates_initialized`, stamped out once per
/// per-CPU kernel.
///
/// `$cpl` names the kernel's `common_prefix_len_ptr` (used by the hash-chain
/// fallback walk); `$bt_update`, `$bt_insert`, `$for_each_rep` and `$hash3`
/// name the kernel-specific wrappers of the inner helpers. Keeping every
/// helper under one `target_feature` umbrella lets the whole per-position
/// pipeline (BT-tree fill, rep probing, hash3 probing, then BT match
/// collection or the HC chain walk) inline without ABI barriers on the
/// level22 hot path.
///
/// The expansion clears `$out` and then offers candidates to
/// `BtMatcher::push_candidate_ladder` in this order: repcodes, hash3 (only
/// when the strategy sets `USE_HASH3`), BT or hash-chain matches, and finally
/// the query's LDM candidate. It `return`s early — after offering only the
/// LDM candidate — when `$abs_pos` is below `skip_insert_until_abs` or when
/// fewer than four bytes of live history remain at the position.
macro_rules! collect_optimal_candidates_initialized_body {
    (
        $self:expr,
        $strategy_ty:ty,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $query:ident,
        $out:ident,
        $bt_matchfinder:ident,
        $bt_update:ident,
        $bt_insert:ident,
        $for_each_rep:ident,
        $hash3:ident,
        $cpl:path $(,)?
    ) => {{
        // `USE_HASH3` is an associated const of the strategy type, so each
        // monomorphisation keeps or drops the hash3 block as a whole. Only
        // one direction is asserted: a strategy that declares hash3 needs a
        // configured table. A non-zero `hash3_log` without `USE_HASH3` is
        // tolerated (storage may be pre-allocated without being used).
        let hash3_enabled: bool = <$strategy_ty as super::strategy::Strategy>::USE_HASH3;
        debug_assert!(!$self.table.hash_table.is_empty());
        debug_assert!($self.table.hash3_log == 0 || !$self.table.hash3_table.is_empty());
        debug_assert!(
            !hash3_enabled || $self.table.hash3_log != 0,
            "Strategy::USE_HASH3 = true but runtime hash3_log is 0 — call configure() first",
        );
        debug_assert!(!$self.table.chain_table.is_empty());

        let min_len = HC_OPT_MIN_MATCH_LEN;
        let rep_history = $query.reps;
        let pending_literals = $query.lit_len;
        let ldm_hint = $query.ldm_candidate;
        // Running best length threaded through every ladder push below.
        let mut ladder_best = 0usize;
        $out.clear();

        // Positions the table was told to skip only get the LDM candidate.
        if $abs_pos < $self.table.skip_insert_until_abs {
            if let Some(ldm) = ldm_hint {
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_best,
                    ldm,
                    min_len,
                );
            }
            return;
        }

        if $bt_matchfinder {
            // SAFETY: the caller sits under the same target_feature umbrella
            // as `$bt_update`; the runtime kernel detector already gated entry.
            unsafe { $self.table.$bt_update($abs_pos, $current_abs_end) };
        }

        // Too close to the end of live history to probe: LDM candidate only.
        let cur_idx = $abs_pos - $self.table.history_abs_start;
        if $self.table.live_history().len() < cur_idx + 4 {
            if let Some(ldm) = ldm_hint {
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_best,
                    ldm,
                    min_len,
                );
            }
            return;
        }

        let mut stop_search = false;
        let mut rep_found = false;
        // SAFETY: same umbrella; the closure is monomorphized per call.
        unsafe {
            $self.hc.$for_each_rep(
                &$self.table,
                $abs_pos,
                pending_literals,
                rep_history,
                $current_abs_end,
                min_len,
                |rep| {
                    rep_found |= rep.match_len >= min_len;
                    let _ = super::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        &mut ladder_best,
                        rep,
                        min_len,
                    );
                    let beats_sufficient = rep.match_len > $profile.sufficient_match_len;
                    // The rep helper caps `rep.match_len` at
                    // `current_abs_end - abs_pos`, so this sum never exceeds
                    // `current_abs_end` and cannot overflow `usize`.
                    let hits_block_end = $abs_pos + rep.match_len >= $current_abs_end;
                    stop_search |= beats_sufficient || hits_block_end;
                },
            )
        };

        // Short-match probe: only for hash3 strategies, and only while no
        // earlier source has already produced a usable or terminal match.
        if hash3_enabled && !stop_search && ladder_best < min_len {
            $self.table.update_hash3_until($abs_pos);
            // SAFETY: same umbrella for the hash3 wrapper.
            let h3_hit = unsafe {
                $self
                    .table
                    .$hash3($abs_pos, $current_abs_end, min_len)
            };
            if let Some(h3) = h3_hit {
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_best,
                    h3,
                    min_len,
                );
                if !rep_found
                    && (h3.match_len > $profile.sufficient_match_len
                        || $abs_pos + h3.match_len >= $current_abs_end)
                {
                    $self.table.skip_insert_until_abs = $abs_pos + 1;
                    stop_search = true;
                }
            }
        }

        if !stop_search {
            if $bt_matchfinder {
                // SAFETY: same umbrella for the BT insert-and-collect wrapper.
                unsafe {
                    $self.table.$bt_insert(
                        $abs_pos,
                        $current_abs_end,
                        $profile,
                        min_len,
                        &mut ladder_best,
                        $out,
                    )
                };
            } else {
                // Hash-chain fallback: register the position, then walk at
                // most `depth_budget` chain entries.
                $self.table.insert_position($abs_pos);
                let depth_budget = $profile.max_chain_depth.min($self.hc.search_depth);
                let history = $self.table.live_history();
                // Raw `+ 9` is safe here — see `bt_insert_step_no_rebase_body!`
                // for the discussion of the `STREAM_ABS_HEADROOM` cap in
                // `MatchTable::add_data`.
                let mut furthest_end_abs = $abs_pos + 9;
                if depth_budget > 0 {
                    for (visited, candidate_abs) in $self
                        .hc
                        .chain_candidates(&$self.table, $abs_pos)
                        .into_iter()
                        .enumerate()
                    {
                        // Depth exhausted, or the chain's end sentinel.
                        if visited >= depth_budget || candidate_abs == usize::MAX {
                            break;
                        }
                        let in_window = candidate_abs
                            >= $self.table.window_low_abs_for_target($abs_pos)
                            && candidate_abs < $abs_pos;
                        if !in_window {
                            continue;
                        }
                        let candidate_idx = candidate_abs - $self.table.history_abs_start;
                        debug_assert!(
                            $abs_pos <= $current_abs_end,
                            "HC chain walker called past current block end"
                        );
                        let tail_limit = $current_abs_end - $abs_pos;
                        let base = history.as_ptr();
                        // SAFETY: both indices are history-relative and
                        // `tail_limit` bounds the scan within `history`.
                        // `$cpl` is the kernel-specific
                        // common_prefix_len_ptr, inlined because the
                        // surrounding wrapper carries the same target_feature.
                        let match_len = unsafe {
                            $cpl(base.add(candidate_idx), base.add(cur_idx), tail_limit)
                        };
                        if match_len < min_len {
                            continue;
                        }
                        let accepted = super::bt::BtMatcher::push_candidate_ladder(
                            $out,
                            &mut ladder_best,
                            MatchCandidate {
                                start: $abs_pos,
                                offset: $abs_pos - candidate_abs,
                                match_len,
                            },
                            min_len,
                        );
                        if accepted {
                            furthest_end_abs = furthest_end_abs.max(candidate_abs + match_len);
                        }
                        if match_len > HC_OPT_NUM || $abs_pos + match_len >= $current_abs_end {
                            break;
                        }
                    }
                }
                // `furthest_end_abs` starts at `abs_pos + 9` and only grows,
                // so subtracting 8 cannot underflow.
                $self.table.skip_insert_until_abs = $self
                    .table
                    .skip_insert_until_abs
                    .max(furthest_end_abs - 8);
            }
        }

        // The LDM candidate is always offered last.
        if let Some(ldm) = ldm_hint {
            let _ = super::bt::BtMatcher::push_candidate_ladder(
                $out,
                &mut ladder_best,
                ldm,
                min_len,
            );
        }
    }};
}
5724
/// Body of `hash3_candidate`, parameterized over the per-CPU
/// `common_prefix_len_ptr` symbol (`$cpl`). The hash3 probe checks one
/// candidate per position when invoked, so the per-call ABI savings compound
/// across the segment. Crate-private (see `bt_insert_step_no_rebase_body!`).
///
/// Must expand inside a function returning `Option<MatchCandidate>`. The
/// expansion yields `None` when:
/// - `hash3_log` is zero (no hash3 table configured),
/// - `$abs_pos` precedes `history_abs_start`, or fewer than four bytes of
///   live history remain at the position,
/// - the bucket is missing or does not decode to an absolute position,
/// - the decoded candidate lies outside `[history_abs_start, $abs_pos)`,
/// - the offset is `>= HC3_MAX_OFFSET`,
/// - the measured prefix is shorter than `$min_match_len`.
///
/// Otherwise it yields the candidate `{ start: $abs_pos, offset, match_len }`.
macro_rules! hash3_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $cpl:path $(,)?
    ) => {{
        if $table.hash3_log == 0 {
            return None;
        }
        let idx = $abs_pos.checked_sub($table.history_abs_start)?;
        let concat = $table.live_history();
        if idx + 4 > concat.len() {
            return None;
        }
        let hash3 = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash3_log,
            3,
        );
        // A bucket index outside the table is treated like an empty bucket.
        let entry = $table
            .hash3_table
            .get(hash3)
            .copied()
            .unwrap_or($crate::encoding::match_table::storage::HC_EMPTY);
        let candidate_abs =
            $crate::encoding::match_table::storage::MatchTable::stored_abs_position_fast(
                entry,
                $table.position_base,
                $table.index_shift,
            )?;
        if candidate_abs < $table.history_abs_start || candidate_abs >= $abs_pos {
            return None;
        }
        let offset = $abs_pos - candidate_abs;
        if offset >= $crate::encoding::bt::HC3_MAX_OFFSET {
            return None;
        }
        let candidate_idx = candidate_abs - $table.history_abs_start;
        // Clamp the scan to the bytes actually present after `idx`, the same
        // way the repcode probe clamps its `max`. `concat.len() - idx >= 4`
        // by the early return above, so the subtraction cannot underflow.
        // When `$current_abs_end` already lies inside live history the clamp
        // is a no-op.
        let tail_limit = $current_abs_end
            .saturating_sub($abs_pos)
            .min(concat.len() - idx);
        let base = concat.as_ptr();
        // SAFETY: `candidate_idx < idx` (candidate_abs < abs_pos) and
        // `idx + tail_limit <= concat.len()` (clamp above), so both pointers
        // stay inside `concat` for the whole scan.
        let match_len = unsafe { $cpl(base.add(candidate_idx), base.add(idx), tail_limit) };
        (match_len >= $min_match_len).then_some($crate::encoding::opt::types::MatchCandidate {
            start: $abs_pos,
            offset,
            match_len,
        })
    }};
}
5782pub(crate) use hash3_candidate_body;
5783
/// Body of `for_each_repcode_candidate_with_reps`, parameterized over the
/// per-CPU `common_prefix_len_ptr` symbol (`$cpl`) so the per-rep prefix probe
/// inlines under the wrapper's `target_feature` umbrella instead of crossing
/// the ABI boundary through the dispatcher. Three rep probes per encoded
/// position add up to thousands per segment, so the per-call barrier was
/// non-trivial.
///
/// Must expand inside a function returning `()`. For each usable repeat
/// offset the expansion measures the common prefix between `$abs_pos` and
/// `$abs_pos - rep` (capped at `$current_abs_end - $abs_pos` and at the end of
/// live history) and calls `$f` with a `MatchCandidate` when the prefix is at
/// least `$min_match_len` long. Offsets that are zero, exceed `$abs_pos`, or
/// point before `history_abs_start` are skipped. It returns without probing
/// when fewer than four bytes of live history remain at `$abs_pos`.
///
/// The callback `$f` runs in the wrapper's umbrella context too, so closures
/// that capture mutable state still work (`FnMut`). Crate-private
/// (see `bt_insert_step_no_rebase_body!`).
macro_rules! for_each_repcode_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $lit_len:ident,
        $reps:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $f:ident,
        $cpl:path $(,)?
    ) => {{
        // With a zero literal run the probed offsets are `reps[1]`,
        // `reps[2]` and `reps[0] - 1`; otherwise the three reps as stored.
        let rep_offsets: [Option<usize>; 3] = if $lit_len == 0 {
            [
                Some($reps[1] as usize),
                Some($reps[2] as usize),
                // `then` (lazy) rather than `then_some` (eager): the
                // subtraction must only run once the guard holds, otherwise
                // `reps[0] == 0` trips the overflow check in debug builds.
                ($reps[0] > 1).then(|| ($reps[0] - 1) as usize),
            ]
        } else {
            [
                Some($reps[0] as usize),
                Some($reps[1] as usize),
                Some($reps[2] as usize),
            ]
        };
        let concat = $table.live_history();
        let current_idx = $abs_pos - $table.history_abs_start;
        if current_idx + 4 > concat.len() {
            return;
        }
        let tail_limit = $current_abs_end.saturating_sub($abs_pos);
        let base = concat.as_ptr();
        let concat_len = concat.len();
        for rep in rep_offsets.iter().copied().flatten() {
            if rep == 0 {
                continue;
            }
            // Skip offsets that reach before position 0 or before the start
            // of live history.
            let Some(candidate_pos) = $abs_pos.checked_sub(rep) else {
                continue;
            };
            let Some(candidate_idx) = candidate_pos.checked_sub($table.history_abs_start) else {
                continue;
            };
            // Upstream zstd `ZSTD_readMINMATCH` gate (zstd_opt.c): a 4-byte
            // (3-byte when min_match_len == 3) equality probe before the full
            // prefix scan. A mismatch here implies `match_len <
            // min_match_len`, which the post-scan check would reject anyway,
            // so the gate only saves the prefix-kernel call on the common
            // no-match case.
            //
            // SAFETY: `current_idx + 4 <= concat_len` (early return above)
            // and `candidate_idx < current_idx` (rep >= 1), so both 4-byte
            // reads stay inside `concat`.
            let gate_matches = unsafe {
                let cand = base.add(candidate_idx).cast::<u32>().read_unaligned();
                let cur = base.add(current_idx).cast::<u32>().read_unaligned();
                if $min_match_len == 3 {
                    // `to_le` puts the lowest-address byte in the low bits on
                    // every target, so the mask keeps the first three bytes.
                    (cand.to_le() & 0x00FF_FFFF) == (cur.to_le() & 0x00FF_FFFF)
                } else {
                    cand == cur
                }
            };
            if !gate_matches {
                continue;
            }
            // `max` is the shortest of: bytes left after the candidate, bytes
            // left after the current position, and the block tail.
            let max = (concat_len - candidate_idx)
                .min(concat_len - current_idx)
                .min(tail_limit);
            // SAFETY: `candidate_idx < current_idx < concat_len`, and `max`
            // is clamped above so neither pointer overruns `concat`.
            let match_len = unsafe { $cpl(base.add(candidate_idx), base.add(current_idx), max) };
            if match_len < $min_match_len {
                continue;
            }
            $f(MatchCandidate {
                start: $abs_pos,
                offset: rep,
                match_len,
            });
        }
    }};
}
5878pub(crate) use for_each_repcode_candidate_body;
5879
5880/// `bt_insert_and_collect_matches` body parameterized over the per-CPU
5881/// `count_match_from_indices` symbol. Same shape as
5882/// [`bt_insert_step_no_rebase_body`] — picks up the matching kernel through
5883/// `$cmf` so the per-iteration vector probe inlines under the wrapper's
5884/// `target_feature` umbrella. Returns nothing (matches the original method).
5885/// Crate-private (see `bt_insert_step_no_rebase_body!`).
5886macro_rules! bt_insert_and_collect_matches_body {
5887    (
5888        $table:expr,
5889        $search_depth:expr,
5890        $abs_pos:ident,
5891        $current_abs_end:ident,
5892        $profile:ident,
5893        $min_match_len:ident,
5894        $best_len_for_skip:ident,
5895        $out:ident,
5896        $cmf:path $(,)?
5897    ) => {{
5898        let idx = $abs_pos - $table.history_abs_start;
5899        // Borrowed-aware live region (owned: `history[history_start..]`;
5900        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
5901        // so the slice holds NO borrow and coexists with the `&mut $table`
5902        // binary-tree writes below. Owned is byte-identical (same bytes).
5903        let concat: &[u8] = unsafe {
5904            let lh = $table.live_history();
5905            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
5906        };
5907        if idx + 8 > concat.len() {
5908            return;
5909        }
5910        debug_assert!(
5911            $abs_pos <= $current_abs_end,
5912            "BT collect called past current block end"
5913        );
5914        let tail_limit = $current_abs_end - $abs_pos;
5915        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
5916            concat,
5917            idx,
5918            $table.hash_log,
5919            $table.search_mls,
5920        );
5921        // Prefetch the hash bucket now. For the large L16+ hash table over
5922        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
5923        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
5924        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
5925        // below is reached with nothing to hide it behind — it stalled a large
5926        // share of this function's cycles. Issuing the hint here lets the miss
5927        // overlap the address setup that follows.
5928        #[cfg(all(
5929            target_feature = "sse",
5930            any(target_arch = "x86", target_arch = "x86_64")
5931        ))]
5932        {
5933            #[cfg(target_arch = "x86")]
5934            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
5935            #[cfg(target_arch = "x86_64")]
5936            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
5937            // SAFETY: prefetch is a hint that never faults; `hash` indexes
5938            // `hash_table` directly below, so it is in bounds.
5939            unsafe {
5940                _mm_prefetch($table.hash_table.as_ptr().add(hash).cast(), _MM_HINT_T0);
5941            }
5942            // Prefetch the NEXT position's bucket too. The optimal-parser DP
5943            // advances one position per iteration, so this miss is issued a
5944            // full BT walk plus the next iteration's pre-collect work ahead of
5945            // the collect that will read it — far more lead than the same-call
5946            // hint above, enough to hide the full DRAM latency.
5947            if idx + 1 + 8 <= concat.len() {
5948                let hash_next =
5949                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
5950                        concat,
5951                        idx + 1,
5952                        $table.hash_log,
5953                        $table.search_mls,
5954                    );
5955                // SAFETY: prefetch never faults; an out-of-range index is a
5956                // harmless no-op hint.
5957                unsafe {
5958                    _mm_prefetch(
5959                        $table.hash_table.as_ptr().add(hash_next).cast(),
5960                        _MM_HINT_T0,
5961                    );
5962                }
5963            }
5964        }
5965        let Some(relative_pos) = $table.relative_position($abs_pos) else {
5966            return;
5967        };
5968        let stored = relative_pos + 1;
5969        let bt_mask = $table.bt_mask();
5970        // Hoist the BT pointer-pair table's base out of `self` once: every
5971        // access below is `chain_table[computed_index]` through `&mut self`,
5972        // which the optimizer cannot prove loop-invariant, so it reloads the
5973        // Vec's (ptr,len) from the struct AND bounds-checks on every tree
5974        // step (the upstream zstd walks a raw `U32* btable`, zstd_opt.c). The raw
5975        // base carries no borrow, so the `&self` helper calls in the loop
5976        // (`bt_pair_index_for_abs`, `window_low_abs_for_target`,
5977        // `relative_position`) coexist — they read other fields, never
5978        // `chain_table`. Indices are in bounds by the BT invariants:
5979        // `bt_pair_index_for_abs` returns `2*(abs & bt_mask) (+1)` ≤
5980        // `chain_table.len()-1`, and the slots only ever hold those values.
5981        let chain_ptr = $table.chain_table.as_mut_ptr();
5982        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
5983        // See `bt_insert_step_no_rebase_body!`: saturating is needed for the
5984        // first BT walk of a fresh frame where `abs_pos < bt_mask`.
5985        let bt_low = $abs_pos.saturating_sub(bt_mask);
5986        let window_low = $table.window_low_abs_for_target($abs_pos);
5987        // Upstream zstd-style window bound in stored space so the BT-walk loop
5988        // condition rejects out-of-window / HC_EMPTY candidates WITHOUT
5989        // decoding them (mirrors upstream `while ... matchIndex >= matchLow`):
5990        // one range check on `match_stored` instead of decode-then-break,
5991        // dropping the wasted candidate_abs decode on every walk's terminating
5992        // step. candidate_abs(s) = (position_base + s - 1) - index_shift =
5993        // base + s (wrapping); in-window ⟺ candidate_abs - window_low <
5994        // abs_pos - window_low ⟺ s.wrapping_add(win_off) < win_range.
5995        // HC_EMPTY (s = 0) maps to base = (lowest representable abs) - 1 <
5996        // window_low, so it falls out of range and ends the walk.
5997        let win_off = $table
5998            .position_base
5999            .wrapping_sub(1)
6000            .wrapping_sub($table.index_shift)
6001            .wrapping_sub(window_low);
6002        let win_range = $abs_pos - window_low;
6003        // Raw `+ 9` is safe here — see `bt_insert_step_no_rebase_body!`
6004        // for the full discussion of the upstream `STREAM_ABS_HEADROOM`
6005        // cap in `MatchTable::add_data`.
6006        let mut match_end_abs = $abs_pos + 9;
6007        let mut compares_left = $profile.max_chain_depth.min($search_depth);
6008        let mut common_length_smaller = 0usize;
6009        let mut common_length_larger = 0usize;
6010        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
6011        let mut smaller_slot = pair_idx;
6012        let mut larger_slot = pair_idx + 1;
6013        let mut match_stored = $table.hash_table[hash];
6014        $table.hash_table[hash] = stored;
6015        // Upstream zstd semantics: `bestLength` starts at `lengthToBeat - 1`; rep/hash3
6016        // probing may raise it; BT then only reports strictly longer matches.
6017        // `min_match_len >= HC_FORMAT_MINMATCH (3)` by configure invariant,
6018        // so `min_match_len - 1 >= 2` cannot underflow.
6019        debug_assert!(
6020            $min_match_len >= $crate::encoding::cost_model::HC_FORMAT_MINMATCH,
6021            "min_match_len must be at least HC_FORMAT_MINMATCH"
6022        );
6023        let mut best_len = (*$best_len_for_skip).max($min_match_len - 1);
6024
6025        // Upstream zstd-form loop condition: the stored-space window range check
6026        // (`s.wrapping_add(win_off) < win_range`) rejects out-of-window and
6027        // HC_EMPTY candidates here, so the terminating step never enters the
6028        // body — no wasted candidate_abs decode, matching upstream's
6029        // `while ... matchIndex >= matchLow`.
6030        while compares_left > 0 && (match_stored as usize).wrapping_add(win_off) < win_range {
6031            compares_left -= 1;
6032            // The condition proved this candidate is in `[window_low,
6033            // abs_pos)`, so `match_stored >= 1` (HC_EMPTY is out of range) and
6034            // the `- 1` cannot underflow; candidate_abs == base + match_stored.
6035            let candidate_abs = ($table.position_base + (match_stored as usize - 1))
6036                .wrapping_sub($table.index_shift);
6037
6038            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
6039            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
6040            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
6041            // table not realloc'd during the walk.
6042            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
6043            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
6044            let seed_len = common_length_smaller.min(common_length_larger);
6045            let candidate_idx = candidate_abs - $table.history_abs_start;
6046            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
6047            // concat.len()`.
6048            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };
6049
6050            if match_len > best_len {
6051                let offset = $abs_pos - candidate_abs;
6052                let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
6053                    $out,
6054                    $best_len_for_skip,
6055                    $crate::encoding::opt::types::MatchCandidate {
6056                        start: $abs_pos,
6057                        offset,
6058                        match_len,
6059                    },
6060                    $min_match_len,
6061                );
6062                if accepted {
6063                    best_len = match_len;
6064                    // BT walker invariants: `candidate_abs < abs_pos`
6065                    // and `match_len <= tail_limit = current_abs_end -
6066                    // abs_pos`. So `candidate_abs + match_len <
6067                    // abs_pos + tail_limit = current_abs_end`, which
6068                    // fits in `usize` on every supported target (32-bit
6069                    // i686 included) — the addition stays within the
6070                    // current block.
6071                    let candidate_end = candidate_abs + match_len;
6072                    if candidate_end > match_end_abs {
6073                        match_end_abs = candidate_end;
6074                    }
6075                    if match_len >= tail_limit
6076                        || match_len > $crate::encoding::cost_model::HC_OPT_NUM
6077                    {
6078                        break;
6079                    }
6080                }
6081            }
6082
6083            if match_len >= tail_limit {
6084                break;
6085            }
6086
6087            let candidate_next = candidate_idx + match_len;
6088            let current_next = idx + match_len;
6089            // SAFETY: first-differing positions after a match_len-long prefix;
6090            // match_len < tail_limit (break above) + BT-walk bound
6091            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
6092            if unsafe {
6093                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
6094            } {
6095                // SAFETY: `smaller_slot` holds a valid pair index (init
6096                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
6097                // sentinel is set only just before `break`, never written here.
6098                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
6099                common_length_smaller = match_len;
6100                if candidate_abs <= bt_low {
6101                    smaller_slot = usize::MAX;
6102                    break;
6103                }
6104                smaller_slot = next_pair_idx + 1;
6105                match_stored = next_larger;
6106            } else {
6107                // SAFETY: as above for `larger_slot`.
6108                unsafe { *chain_ptr.add(larger_slot) = match_stored };
6109                common_length_larger = match_len;
6110                if candidate_abs <= bt_low {
6111                    larger_slot = usize::MAX;
6112                    break;
6113                }
6114                larger_slot = next_pair_idx;
6115                match_stored = next_smaller;
6116            }
6117        }
6118
6119        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
6120        // pair indices into the hoisted `chain_table` base.
6121        if smaller_slot != usize::MAX {
6122            unsafe {
6123                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
6124            };
6125        }
6126        if larger_slot != usize::MAX {
6127            unsafe {
6128                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
6129            };
6130        }
6131
6132        // Dict dual-probe (upstream zstd `ZSTD_dictMatchState`, zstd_opt.c:777-813):
6133        // after the live tree, descend the immutable dictionary BINARY TREE
6134        // (built in `prime_dms_bt`) with its OWN compare budget and push any
6135        // dict match longer than the live best into the ladder. The DUBT
6136        // descent reaches the longest dict match efficiently (a hash-chain
6137        // surfaced only the few same-bucket candidates and left most of the
6138        // dict savings unrealised at btlazy2 / btopt). Dict positions are
6139        // dictionary-relative concat indices in `[0, region)`, pinned at the
6140        // front of history, so a dict candidate at `dict_idx` sits at offset
6141        // `idx - dict_idx` (no upstream zstd `dmsIndexDelta`). The optimal parser
6142        // prices these (its DP lookahead values the repcode chain a dict match
6143        // seeds); the greedy/lazy parser commits the longest.
6144        if let Some(dms) = $table.dms.table() {
6145            let region = $table.dms.region_len();
6146            let dh = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
6147                concat,
6148                idx,
6149                dms.hash_log,
6150                dms.mls,
6151            );
6152            let mut dcur = dms.hash_table[dh];
6153            // DUBT seed lengths: bytes already known common on each side, so
6154            // `$cmf` resumes from there (upstream zstd commonLengthSmaller/Larger).
6155            let mut common_smaller = 0usize;
6156            let mut common_larger = 0usize;
6157            let mut dms_compares = $profile.max_chain_depth.min($search_depth);
6158            while dms_compares > 0 && dcur != $crate::encoding::match_table::storage::HC_EMPTY {
6159                let dict_idx = (dcur - 1) as usize;
6160                // The dict tree holds only dict positions (`< region <= idx`).
6161                if dict_idx >= region || dict_idx >= idx {
6162                    break;
6163                }
6164                dms_compares -= 1;
6165                let pair = 2 * dict_idx;
6166                let seed = common_smaller.min(common_larger);
6167                // SAFETY: `dict_idx < idx` and `idx + tail_limit <=
6168                // concat.len()` (checked at entry); same umbrella as the live
6169                // walk's `$cmf`. `seed <= prior match_len <= tail_limit`.
6170                let match_len = unsafe { $cmf(concat, idx, dict_idx, tail_limit, seed) };
6171                if match_len > best_len {
6172                    let offset = idx - dict_idx;
6173                    let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
6174                        $out,
6175                        $best_len_for_skip,
6176                        $crate::encoding::opt::types::MatchCandidate {
6177                            start: $abs_pos,
6178                            offset,
6179                            match_len,
6180                        },
6181                        $min_match_len,
6182                    );
6183                    if accepted {
6184                        best_len = match_len;
6185                        let candidate_end = $abs_pos + match_len;
6186                        if candidate_end > match_end_abs {
6187                            match_end_abs = candidate_end;
6188                        }
6189                        if match_len > $crate::encoding::cost_model::HC_OPT_NUM {
6190                            break;
6191                        }
6192                    }
6193                }
6194                // Match reached the block tail: can't order the pair (upstream zstd
6195                // `ip+matchLength == iLimit`), and indexing `concat[idx +
6196                // match_len]` below would step past the searchable region.
6197                if match_len >= tail_limit {
6198                    break;
6199                }
6200                // Descend the DUBT (upstream zstd zstd_opt.c:806-811): dict candidate
6201                // smaller than input → its larger child is closer to `idx`.
6202                if concat[dict_idx + match_len] < concat[idx + match_len] {
6203                    common_smaller = match_len;
6204                    dcur = dms.chain_table[pair + 1];
6205                } else {
6206                    common_larger = match_len;
6207                    dcur = dms.chain_table[pair];
6208                }
6209            }
6210        }
6211
6212        // `match_end_abs >= abs_pos + 9 >= 9` (initialized and monotonic),
6213        // so `match_end_abs - 8 >= 1` cannot underflow.
6214        $table.skip_insert_until_abs = match_end_abs - 8;
6215    }};
6216}
6217pub(crate) use bt_insert_and_collect_matches_body;
6218
6219impl HcMatchGenerator {
6220    /// Heap bytes this generator owns: the shared match table plus the BT
6221    /// backend's optimal-parser / LDM scratch (the HC knobs are inline).
6222    fn heap_size(&self) -> usize {
6223        self.table.heap_size() + self.backend.heap_size()
6224    }
6225
6226    fn should_run_btultra2_seed_pass<S: super::strategy::Strategy>(
6227        &self,
6228        current_len: usize,
6229    ) -> bool {
6230        // The in-block two-pass dynamic-stats seed (`initStats_ultra`)
6231        // is btultra2-only. `TWO_PASS_SEED` is `false` for every other
6232        // strategy — including btultra, which now shares the hash3
6233        // short-match probe but stays single-pass — so the seed call and
6234        // its body drop at codegen time for all non-btultra2 kernels.
6235        if !S::TWO_PASS_SEED {
6236            return false;
6237        }
6238        let HcBackend::Bt(bt) = &self.backend else {
6239            return false;
6240        };
6241        bt.opt_state.lit_length_sum == 0
6242            && bt.opt_state.dictionary_seed.is_none()
6243            && !self.table.dictionary_primed_for_frame
6244            && bt.ldm_sequences.is_empty()
6245            && self.table.window_size == current_len
6246            && self.table.history_abs_start == 0
6247            && self.table.chunk_lens.len() == 1
6248            && current_len > HC_PREDEF_THRESHOLD
6249    }
6250
6251    fn new(max_window_size: usize) -> Self {
6252        Self {
6253            table: super::match_table::storage::MatchTable::new(max_window_size),
6254            hc: super::hc::HcMatcher::new(2, HC_SEARCH_DEPTH, HC_TARGET_LEN),
6255            // Default to the zero-sized HC backend; `configure()` swaps
6256            // in a `BtMatcher` only when an optimal strategy lands.
6257            backend: HcBackend::Hc,
6258            // Lazy is the per-construct default — every production
6259            // caller calls `configure()` before the first encode and
6260            // overwrites this. Tests that drive `HcMatchGenerator`
6261            // without calling `configure()` end up in the
6262            // `start_matching_lazy` arm of the test dispatcher, which
6263            // matches the previous default behaviour.
6264            strategy_tag: super::strategy::StrategyTag::Lazy,
6265        }
6266    }
6267
6268    fn configure(&mut self, config: HcConfig, tag: super::strategy::StrategyTag, window_log: u8) {
6269        use super::strategy::StrategyTag;
6270        // Mirror the driver-resolved strategy tag so the
6271        // `#[cfg(test)] start_matching` dispatcher can route
6272        // BtOpt / BtUltra / BtUltra2 to distinct monomorphisations.
6273        self.strategy_tag = tag;
6274        let is_btultra2 = tag == StrategyTag::BtUltra2;
6275        let uses_bt = matches!(
6276            tag,
6277            StrategyTag::Btlazy2
6278                | StrategyTag::BtOpt
6279                | StrategyTag::BtUltra
6280                | StrategyTag::BtUltra2
6281        );
6282        // btultra and btultra2 both run the mls=3 hash3 short-match probe
6283        // (clevels.h minMatch 3). The `is_btultra2` flag below stays
6284        // exclusive to btultra2 because it tweaks the BT rebase boundary,
6285        // not match finding.
6286        let wants_hash3 = matches!(tag, StrategyTag::BtUltra | StrategyTag::BtUltra2);
6287        let next_hash3_log = if wants_hash3 {
6288            HC3_HASH_LOG.min(window_log as usize)
6289        } else {
6290            0
6291        };
6292        let resize = self.table.hash_log != config.hash_log
6293            || self.table.chain_log != config.chain_log
6294            || self.table.hash3_log != next_hash3_log;
6295        self.table.hash_log = config.hash_log;
6296        self.table.chain_log = config.chain_log;
6297        self.table.hash3_log = next_hash3_log;
6298        self.hc.search_depth = if uses_bt {
6299            config.search_depth
6300        } else {
6301            config.search_depth.min(MAX_HC_SEARCH_DEPTH)
6302        };
6303        self.hc.target_len = config.target_len;
6304        // Mirror strategy-derived flags + HC search depth onto MatchTable
6305        // so the BT walker and rebase machinery can read them directly
6306        // without dispatching back through HcMatchGenerator.
6307        self.table.search_depth = self.hc.search_depth;
6308        self.table.is_btultra2 = is_btultra2;
6309        self.table.uses_bt = uses_bt;
6310        // BT finder hash width, upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`,
6311        // carried explicitly in the level config so a `target_length` override
6312        // cannot silently flip the finder between 5- and 4-byte hashing. Only
6313        // the BT body reads it; HC/lazy levels leave it at 4. clevels.h
6314        // (srcSize > 256 KiB tier): btlazy2 L13-15 + btopt L16 are minMatch=5,
6315        // btopt L17 is minMatch=4, btultra/btultra2 are minMatch=3 (4-byte main
6316        // hash + the hash3 short-match probe).
6317        self.table.search_mls = config.search_mls;
6318        // Stage D: promote the backend discriminator. HC modes drop the
6319        // BT scratch buffers entirely; switching back into a BT mode
6320        // allocates a fresh `BtMatcher` on demand.
6321        match (&self.backend, self.table.uses_bt) {
6322            (HcBackend::Hc, true) => {
6323                self.backend = HcBackend::Bt(alloc::boxed::Box::new(super::bt::BtMatcher::new()));
6324            }
6325            (HcBackend::Bt(_), false) => {
6326                self.backend = HcBackend::Hc;
6327            }
6328            _ => {}
6329        }
6330        if resize && !self.table.hash_table.is_empty() {
6331            // Force reallocation on next ensure_tables() call.
6332            self.table.hash_table.clear();
6333            self.table.hash3_table.clear();
6334            self.table.chain_table.clear();
6335        }
6336    }
6337
6338    fn seed_dictionary_entropy(
6339        &mut self,
6340        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
6341        ll: Option<&crate::fse::fse_encoder::FSETable>,
6342        ml: Option<&crate::fse::fse_encoder::FSETable>,
6343        of: Option<&crate::fse::fse_encoder::FSETable>,
6344    ) {
6345        if let HcBackend::Bt(bt) = &mut self.backend {
6346            bt.opt_state.seed_dictionary_entropy(huff, ll, ml, of);
6347        }
6348    }
6349
6350    /// Install (or clear) the long-distance-match producer (#27). Only
6351    /// the BT backend owns an `ldm_producer` slot; on the HC (lazy)
6352    /// backend the producer is dropped because there is no optimal-parser
6353    /// candidate buffer to seed. Call after [`Self::reset`].
6354    #[cfg(feature = "hash")]
6355    fn set_ldm_producer(&mut self, producer: Option<super::ldm::LdmProducer>) {
6356        if let HcBackend::Bt(bt) = &mut self.backend {
6357            bt.ldm_producer = producer;
6358        }
6359    }
6360
6361    /// Move the LDM producer out of the BT backend, leaving `None`. Used by the
6362    /// dictionary snapshot path: the producer carries no dictionary state (LDM
6363    /// is not dict-primed; its hash table is empty at capture), so it is not
6364    /// retained in the snapshot — the working frame's freshly-reset producer is
6365    /// reinstated on restore instead.
6366    #[cfg(feature = "hash")]
6367    fn take_ldm_producer(&mut self) -> Option<super::ldm::LdmProducer> {
6368        if let HcBackend::Bt(bt) = &mut self.backend {
6369            bt.ldm_producer.take()
6370        } else {
6371            None
6372        }
6373    }
6374
6375    fn reset(&mut self, reuse_space: impl FnMut(Vec<u8>)) {
6376        self.table.reset(reuse_space);
6377        if let HcBackend::Bt(bt) = &mut self.backend {
6378            bt.reset();
6379        }
6380    }
6381
6382    /// Backfill positions from the tail of the previous slice that couldn't be
6383    /// hashed at the time (insert_position needs 4 bytes of lookahead).
6384    fn skip_matching(&mut self, incompressible_hint: Option<bool>) {
6385        self.table.skip_matching(incompressible_hint);
6386    }
6387
6388    /// Runtime-dispatched entry kept only for in-crate tests. Production
6389    /// callers reach the inner loops through
6390    /// [`Self::start_matching_strategy`] / [`MatchGeneratorDriver::compress_block`]
6391    /// which pick the lazy / optimal arm from `S::USE_BT` at
6392    /// monomorphisation time.
6393    #[cfg(test)]
6394    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6395        use super::strategy::{self, StrategyTag};
6396        // Dispatch on the mirrored `strategy_tag` so each test runs
6397        // under the same monomorphisation production would pick.
6398        // `BtOpt` / `BtUltra` / `BtUltra2` remain distinct here even
6399        // though `table.uses_bt` / `is_btultra2` alone can't separate
6400        // BtOpt from BtUltra.
6401        match self.strategy_tag {
6402            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
6403                self.start_matching_lazy(&mut handle_sequence)
6404            }
6405            StrategyTag::Btlazy2 => self.start_matching_btlazy2(&mut handle_sequence),
6406            StrategyTag::BtOpt => {
6407                self.start_matching_optimal::<strategy::BtOpt>(&mut handle_sequence)
6408            }
6409            StrategyTag::BtUltra => {
6410                self.start_matching_optimal::<strategy::BtUltra>(&mut handle_sequence)
6411            }
6412            StrategyTag::BtUltra2 => {
6413                self.start_matching_optimal::<strategy::BtUltra2>(&mut handle_sequence)
6414            }
6415        }
6416    }
6417
6418    /// Strategy-aware entry point used by
6419    /// [`MatchGeneratorDriver::compress_block`]. Branches on
6420    /// `S::USE_BT` — a compile-time `const` — so each
6421    /// monomorphisation keeps exactly one arm: `Lazy` /
6422    /// `Fast` / `Dfast` / `Greedy` see only `start_matching_lazy`,
6423    /// `BtOpt` / `BtUltra` / `BtUltra2` see only
6424    /// `start_matching_optimal`. The inherent test-only
6425    /// [`HcMatchGenerator::start_matching`] reaches the same arms by
6426    /// runtime-matching on `self.strategy_tag` (the parse-mode field
6427    /// has been removed); production never invokes that path.
6428    pub(crate) fn start_matching_strategy<S: super::strategy::Strategy>(
6429        &mut self,
6430        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
6431    ) {
6432        debug_assert_eq!(
6433            self.table.uses_bt,
6434            S::USE_BT,
6435            "Strategy::USE_BT disagrees with runtime table.uses_bt at HC dispatch"
6436        );
6437        if S::USE_BT {
6438            self.start_matching_optimal::<S>(handle_sequence)
6439        } else {
6440            self.start_matching_lazy(handle_sequence)
6441        }
6442    }
6443
6444    /// Dispatcher: pick the dict-aware monomorph when a separate dms is primed
6445    /// (attach-mode dictionary), else the no-dict monomorph. Mirrors upstream's
6446    /// compile-time `dictMode` split — the `DICT = false` body carries no dms
6447    /// code at all, so the no-dict hot path is unaffected by the dict search.
6448    pub(crate) fn start_matching_lazy(
6449        &mut self,
6450        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6451    ) {
6452        if self.table.dms.is_primed() {
6453            self.start_matching_lazy_impl::<true>(handle_sequence);
6454        } else {
6455            self.start_matching_lazy_impl::<false>(handle_sequence);
6456        }
6457    }
6458
6459    fn start_matching_lazy_impl<const DICT: bool>(
6460        &mut self,
6461        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6462    ) {
6463        self.table.ensure_tables();
6464
6465        // `current_block_range()` is borrowed-aware: owned → last committed
6466        // chunk; borrowed → the staged in-place block range.
6467        let (current_abs_start, current_len) = self.table.current_block_range();
6468        if current_len == 0 {
6469            return;
6470        }
6471        // The current block is the tail of `history` (owned) or the staged
6472        // borrowed range (`get_last_space()` resolves both). Hoist it as a raw
6473        // slice: the routine mutates the hash/chain tables + `offset_hist` but
6474        // never reallocates `history`, so the slice stays valid and we avoid
6475        // re-borrowing `self.table` (which would conflict with the
6476        // `offset_hist` write).
6477        let current_ptr = self.table.get_last_space().as_ptr();
6478        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
6479
6480        let current_abs_end = current_abs_start + current_len;
6481        self.table
6482            .backfill_boundary_positions(current_abs_start, current_abs_end);
6483
6484        let mut pos = 0usize;
6485        let mut literals_start = 0usize;
6486        while pos + HC_MIN_MATCH_LEN <= current_len {
6487            let abs_pos = current_abs_start + pos;
6488            let lit_len = pos - literals_start;
6489
6490            let best = self
6491                .hc
6492                .find_best_match::<DICT>(&self.table, abs_pos, lit_len);
6493            if let Some(candidate) =
6494                self.hc
6495                    .pick_lazy_match::<DICT>(&self.table, abs_pos, lit_len, best)
6496            {
6497                self.table
6498                    .insert_match_span(abs_pos, candidate.start + candidate.match_len);
6499                let start = candidate.start - current_abs_start;
6500                let literals = &current[literals_start..start];
6501                handle_sequence(Sequence::Triple {
6502                    literals,
6503                    offset: candidate.offset,
6504                    match_len: candidate.match_len,
6505                });
6506                let _ = encode_offset_with_history(
6507                    candidate.offset as u32,
6508                    literals.len() as u32,
6509                    &mut self.table.offset_hist,
6510                );
6511                pos = start + candidate.match_len;
6512                literals_start = pos;
6513            } else {
6514                self.table.insert_position(abs_pos);
6515                // Lazy skipping (upstream zstd `ZSTD_compressBlock_lazy_generic`,
6516                // zstd_lazy.c:1614): advance faster over runs with no match.
6517                // `step = ((ip - anchor) >> kSearchStrength) + 1` with
6518                // kSearchStrength = 8, where `ip - anchor` is the current
6519                // literal-run length. On compressible input the run stays short
6520                // (step == 1, identical to a 1-byte advance); on incompressible
6521                // / dict-over-random input the run grows so the parser skips
6522                // ahead (one search per `step` positions) instead of searching
6523                // every byte. Skipped positions are not inserted, mirroring
6524                // upstream (it inserts only searched positions during a no-match
6525                // run). Ratio follows upstream (not byte-identical).
6526                let step = ((pos - literals_start) >> 8) + 1;
6527                pos += step;
6528                // No clamp needed before the tail loop: the search bound and the
6529                // hashable bound are both `pos + HC_MIN_MATCH_LEN <= current_len`
6530                // (HC_MIN_MATCH_LEN == 4 == the insert width), so there is no
6531                // non-searchable-but-hashable anchor to miss. Positions the skip
6532                // jumps over inside the searchable region are intentionally not
6533                // inserted — same as upstream zstd, which advances past them via
6534                // the identical `ip += step` and never hashes them either.
6535            }
6536        }
6537
6538        // Insert remaining hashable positions in the tail (the matching loop
6539        // stops at HC_MIN_MATCH_LEN but insert_position only needs 4 bytes).
6540        while pos + 4 <= current_len {
6541            self.table.insert_position(current_abs_start + pos);
6542            pos += 1;
6543        }
6544
6545        if literals_start < current_len {
6546            handle_sequence(Sequence::Literals {
6547                literals: &current[literals_start..],
6548            });
6549        }
6550    }
6551
6552    /// Register the borrowed input window for the no-copy one-shot path.
6553    /// # Safety
6554    /// `buffer` must outlive the borrowed scans (see `MatchTable`).
6555    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
6556        // SAFETY: forwarded liveness contract.
6557        unsafe { self.table.set_borrowed_window(buffer) };
6558    }
6559
6560    pub(crate) fn clear_borrowed_window(&mut self) {
6561        self.table.clear_borrowed_window();
6562    }
6563
6564    /// Borrowed (no-copy) equivalent of [`Self::start_matching_lazy`]: stage
6565    /// the in-place block range, then run the same lazy chain parse. The
6566    /// parse reads its range via `current_block_range()` and its bytes via
6567    /// `get_last_space()` / `live_history()`, all borrowed-aware, so the block
6568    /// is scanned in place with the per-position window_low offset cap.
6569    pub(crate) fn start_matching_lazy_borrowed(
6570        &mut self,
6571        block_start: usize,
6572        block_end: usize,
6573        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6574    ) {
6575        self.table.stage_borrowed_block(block_start, block_end);
6576        self.start_matching_lazy(handle_sequence);
6577    }
6578
6579    /// Borrowed (no-copy) equivalent of the lazy `skip_matching`: stage the
6580    /// in-place block, then seed positions without an owned-history append.
6581    pub(crate) fn skip_matching_borrowed(
6582        &mut self,
6583        block_start: usize,
6584        block_end: usize,
6585        incompressible_hint: Option<bool>,
6586    ) {
6587        self.table.stage_borrowed_block(block_start, block_end);
6588        self.table.skip_matching(incompressible_hint);
6589    }
6590
6591    /// Upstream zstd `ZSTD_btlazy2` (levels 13-15): binary-tree match finder with a
6592    /// greedy/lazy parse. Bare dispatcher — resolves the runtime tier ONCE
6593    /// per block via `select_kernel()` and calls the matching
6594    /// `start_matching_btlazy2_<kernel>` wrapper, so the per-position BT
6595    /// collect runs under a single `#[target_feature]` umbrella (mirrors
6596    /// `build_optimal_plan_impl`). See `start_matching_btlazy2_body!` for the
6597    /// shared loop.
6598    fn start_matching_btlazy2(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6599        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6600        unsafe {
6601            self.start_matching_btlazy2_neon(&mut handle_sequence)
6602        }
6603        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6604        {
6605            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
6606            match select_kernel() {
6607                FastpathKernel::Avx2Bmi2 => unsafe {
6608                    self.start_matching_btlazy2_avx2_bmi2(&mut handle_sequence)
6609                },
6610                FastpathKernel::Sse42 => unsafe {
6611                    self.start_matching_btlazy2_sse42(&mut handle_sequence)
6612                },
6613                FastpathKernel::Scalar => self.start_matching_btlazy2_scalar(&mut handle_sequence),
6614            }
6615        }
6616        #[cfg(not(any(
6617            all(target_arch = "aarch64", target_endian = "little"),
6618            target_arch = "x86",
6619            target_arch = "x86_64"
6620        )))]
6621        {
6622            self.start_matching_btlazy2_scalar(&mut handle_sequence)
6623        }
6624    }
6625
    /// NEON tier of the btlazy2 parse (aarch64 little-endian only).
    ///
    /// Expands the shared `start_matching_btlazy2_body!` loop inside a
    /// `#[target_feature(enable = "neon")]` function, passing it
    /// `collect_optimal_candidates_initialized_neon` and the NEON
    /// `count_match_from_indices` (presumably the per-position candidate
    /// collector and the match-length counter; see the macro definition).
    ///
    /// # Safety
    /// The running CPU must support NEON.
    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    #[target_feature(enable = "neon")]
    unsafe fn start_matching_btlazy2_neon(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_neon,
            crate::encoding::fastpath::neon::count_match_from_indices
        )
    }
6639
    /// SSE4.2 tier of the btlazy2 parse (x86 / x86_64 only).
    ///
    /// Expands the shared `start_matching_btlazy2_body!` loop inside a
    /// `#[target_feature(enable = "sse4.2")]` function, passing it
    /// `collect_optimal_candidates_initialized_sse42` and the SSE4.2
    /// `count_match_from_indices` (presumably the per-position candidate
    /// collector and the match-length counter; see the macro definition).
    ///
    /// # Safety
    /// The running CPU must support SSE4.2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse4.2")]
    unsafe fn start_matching_btlazy2_sse42(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_sse42,
            crate::encoding::fastpath::sse42::count_match_from_indices
        )
    }
6653
    /// AVX2 + BMI2 tier of the btlazy2 parse (x86 / x86_64 only).
    ///
    /// Expands the shared `start_matching_btlazy2_body!` loop inside a
    /// `#[target_feature(enable = "avx2,bmi2")]` function, passing it
    /// `collect_optimal_candidates_initialized_avx2_bmi2` and the AVX2/BMI2
    /// `count_match_from_indices` (presumably the per-position candidate
    /// collector and the match-length counter; see the macro definition).
    ///
    /// # Safety
    /// The running CPU must support both AVX2 and BMI2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2,bmi2")]
    unsafe fn start_matching_btlazy2_avx2_bmi2(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_avx2_bmi2,
            crate::encoding::fastpath::avx2_bmi2::count_match_from_indices
        )
    }
6667
    /// Scalar tier of the btlazy2 parse: expands the shared
    /// `start_matching_btlazy2_body!` loop with
    /// `collect_optimal_candidates_initialized_scalar` and the scalar
    /// `count_match_from_indices`.
    ///
    /// This wrapper carries no `#[target_feature]` and is a safe fn. The
    /// scalar collect is itself a safe fn, so the `unsafe` block inside the
    /// body macro is inert here, hence `#[allow(unused_unsafe)]`.
    ///
    /// Gated by the same cfg as
    /// `collect_optimal_candidates_initialized_scalar`: absent on
    /// aarch64 little-endian, where NEON is the baseline tier.
    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
    #[allow(unused_unsafe)]
    fn start_matching_btlazy2_scalar(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_scalar,
            crate::encoding::fastpath::scalar::count_match_from_indices
        )
    }
6685
6686    fn start_matching_optimal<S: super::strategy::Strategy>(
6687        &mut self,
6688        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6689    ) {
6690        self.table.ensure_tables();
6691        // Borrowed-aware: owned → last committed chunk; borrowed → staged
6692        // in-place block range.
6693        let (current_abs_start, current_len) = self.table.current_block_range();
6694        if current_len == 0 {
6695            return;
6696        }
6697        let current_ptr = self.table.get_last_space().as_ptr();
6698        // `start_matching_optimal()` mutates tables/state but never mutates or
6699        // reallocates `self.table.history`, so this tail slice remains valid for
6700        // the duration of the routine and avoids cloning the full block.
6701        let current = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
6702
6703        let current_abs_end = current_abs_start + current_len;
6704        self.table
6705            .apply_limited_update_after_long_match(current_abs_start);
6706        let hash3_start_cursor = self
6707            .table
6708            .skip_insert_until_abs
6709            .max(self.table.history_abs_start);
6710        self.table
6711            .backfill_boundary_positions(current_abs_start, current_abs_end);
6712        self.table.next_to_update3 = hash3_start_cursor;
6713        // Borrow split: `prepare_ldm_candidates` needs immutable
6714        // access to the live history (the post-`history_start`
6715        // slice of `self.table.history`) while it mutates the LDM
6716        // bucket table owned by `self.backend.bt_mut()`. Both live
6717        // in disjoint fields of `Self`, so we capture the slice +
6718        // its base before reaching for `bt_mut()`.
6719        //
6720        // The producer operates in absolute stream coordinates
6721        // throughout; `live_history[0]` corresponds to absolute
6722        // `history_abs_start` (upstream zstd `base + dictLimit`), and the
6723        // abs→slice translation happens inside the producer at
6724        // each `live_history[..]` access. Passing the full
6725        // `history` Vec would index into the dead prefix (the
6726        // bytes already retired past `history_start`).
6727        let live_history = self.table.live_history();
6728        let history_abs_start = self.table.history_abs_start;
6729        self.backend.bt_mut().prepare_ldm_candidates(
6730            live_history,
6731            history_abs_start,
6732            current_abs_start,
6733            current_len,
6734        );
6735
6736        if self.should_run_btultra2_seed_pass::<S>(current_len) {
6737            self.run_btultra2_seed_pass(current, current_abs_start, current_len);
6738        }
6739
6740        // Const-generic profile selection: every field is folded from
6741        // S's associated consts (MAX_CHAIN_DEPTH /
6742        // SUFFICIENT_MATCH_LEN / ACCURATE_PRICE / FAVOR_SMALL_OFFSETS),
6743        // so the optimiser produces the literal at codegen time
6744        // without a runtime match.
6745        let profile = HcOptimalCostProfile::const_for_strategy::<S>();
6746        let mut opt_state =
6747            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
6748        opt_state.rescale_freqs(current, profile);
6749        let mut best_plan = core::mem::take(&mut self.backend.bt_mut().opt_segment_plan_scratch);
6750        best_plan.clear();
6751        let mut plan_reps = self.table.offset_hist;
6752        let (mut cursor, mut plan_litlen) =
6753            self.table.opt_start_cursor_and_litlen(current_abs_start);
6754        let mut plan_literals_cursor = 0usize;
6755        let match_loop_limit = current_len.saturating_sub(8);
6756        while cursor < match_loop_limit {
6757            let remaining_len = current_len - cursor;
6758            let segment_abs_start = current_abs_start + cursor;
6759            let segment_start = best_plan.len();
6760            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
6761                &current[cursor..],
6762                segment_abs_start,
6763                remaining_len,
6764                HcOptimalPlanState {
6765                    block_offset: cursor,
6766                    reps: plan_reps,
6767                    litlen: plan_litlen,
6768                    profile,
6769                },
6770                &opt_state,
6771                &mut best_plan,
6772            );
6773            BtMatcher::update_plan_stats_segment(
6774                current,
6775                current_len,
6776                &best_plan[segment_start..],
6777                &mut plan_literals_cursor,
6778                &mut plan_reps,
6779                &mut opt_state,
6780                profile.accurate,
6781            );
6782            plan_reps = end_reps;
6783            plan_litlen = end_litlen;
6784            cursor += consumed_len;
6785        }
6786
6787        self.table
6788            .emit_optimal_plan(current_len, &best_plan, &mut handle_sequence);
6789        best_plan.clear();
6790        self.backend.bt_mut().opt_segment_plan_scratch = best_plan;
6791        self.backend.bt_mut().opt_state = opt_state;
6792    }
6793
6794    fn run_btultra2_seed_pass(
6795        &mut self,
6796        current: &[u8],
6797        current_abs_start: usize,
6798        current_len: usize,
6799    ) {
6800        // The seed pass is BtUltra2-exclusive by name (the only
6801        // caller is `should_run_btultra2_seed_pass`), so pin `S` to
6802        // `BtUltra2` for both the cost-profile lookup and the
6803        // `build_optimal_plan::<S>` call below.
6804        type S = super::strategy::BtUltra2;
6805        let seed_profile = HcOptimalCostProfile::const_for_strategy::<S>();
6806        let mut opt_state =
6807            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
6808        opt_state.rescale_freqs(current, seed_profile);
6809        let mut seed_reps = self.table.offset_hist;
6810        let (mut cursor, mut seed_litlen) =
6811            self.table.opt_start_cursor_and_litlen(current_abs_start);
6812        let mut seed_literals_cursor = 0usize;
6813        let mut seed_plan = core::mem::take(&mut self.backend.bt_mut().opt_seed_plan_scratch);
6814        seed_plan.clear();
6815        let match_loop_limit = current_len.saturating_sub(8);
6816        while cursor < match_loop_limit {
6817            let remaining_len = current_len - cursor;
6818            let segment_abs_start = current_abs_start + cursor;
6819            let segment_start = seed_plan.len();
6820            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
6821                &current[cursor..],
6822                segment_abs_start,
6823                remaining_len,
6824                HcOptimalPlanState {
6825                    block_offset: cursor,
6826                    reps: seed_reps,
6827                    litlen: seed_litlen,
6828                    profile: seed_profile,
6829                },
6830                &opt_state,
6831                &mut seed_plan,
6832            );
6833            BtMatcher::update_plan_stats_segment(
6834                current,
6835                current_len,
6836                &seed_plan[segment_start..],
6837                &mut seed_literals_cursor,
6838                &mut seed_reps,
6839                &mut opt_state,
6840                seed_profile.accurate,
6841            );
6842            seed_plan.truncate(segment_start);
6843            seed_reps = end_reps;
6844            seed_litlen = end_litlen;
6845            cursor += consumed_len;
6846        }
6847        seed_plan.clear();
6848        self.backend.bt_mut().opt_seed_plan_scratch = seed_plan;
6849        self.backend.bt_mut().opt_state = opt_state;
6850
6851        // Upstream zstd initStats_ultra keeps the collected entropy statistics but
6852        // invalidates the first-pass matchfinder history before the real pass.
6853        self.table.position_base = self.table.history_abs_start;
6854        self.table.index_shift = current_len;
6855        self.table.next_to_update3 = current_abs_start;
6856        self.table.skip_insert_until_abs = current_abs_start;
6857        // Upstream zstd `ZSTD_initStats_ultra()` invalidates the first scan by moving
6858        // `window.base` back by `srcSize`, making the real pass start at
6859        // `curr == srcSize` instead of 0. Position 0 is therefore a valid
6860        // table entry in the second pass even though raw C tables reserve
6861        // value 0 as empty during an unshifted first pass.
6862        self.table.allow_zero_relative_position = true;
6863    }
6864
6865    fn build_optimal_plan<S: super::strategy::Strategy>(
6866        &mut self,
6867        current: &[u8],
6868        current_abs_start: usize,
6869        current_len: usize,
6870        initial_state: HcOptimalPlanState,
6871        stats: &HcOptState,
6872        out: &mut Vec<HcOptimalSequence>,
6873    ) -> (u32, [u32; 3], usize, usize) {
6874        debug_assert!(S::USE_BT, "build_optimal_plan called on non-BT strategy");
6875        debug_assert_eq!(initial_state.profile.accurate, S::ACCURATE_PRICE);
6876        debug_assert_eq!(
6877            initial_state.profile.favor_small_offsets,
6878            S::FAVOR_SMALL_OFFSETS
6879        );
6880        // `S::ACCURATE_PRICE` / `S::FAVOR_SMALL_OFFSETS` cannot appear
6881        // as const-generic arguments yet (`generic_const_exprs` is
6882        // still unstable), so dispatch over a 4-arm match — but on the
6883        // strategy's ASSOCIATED CONSTS, not the runtime profile (the
6884        // `debug_assert_eq`s above pin the runtime profile to those
6885        // consts). A const scrutinee folds the three dead arms at
6886        // monomorphisation; matching the runtime profile instead kept
6887        // all four `#[inline(always)]` DP bodies (~16 KB each) alive in
6888        // EVERY `S` instantiation — ~360 KB of the wasm payload.
6889        match (S::ACCURATE_PRICE, S::FAVOR_SMALL_OFFSETS) {
6890            (true, false) => self.build_optimal_plan_impl::<S, true, false>(
6891                current,
6892                current_abs_start,
6893                current_len,
6894                initial_state,
6895                stats,
6896                out,
6897            ),
6898            (true, true) => self.build_optimal_plan_impl::<S, true, true>(
6899                current,
6900                current_abs_start,
6901                current_len,
6902                initial_state,
6903                stats,
6904                out,
6905            ),
6906            (false, false) => self.build_optimal_plan_impl::<S, false, false>(
6907                current,
6908                current_abs_start,
6909                current_len,
6910                initial_state,
6911                stats,
6912                out,
6913            ),
6914            (false, true) => self.build_optimal_plan_impl::<S, false, true>(
6915                current,
6916                current_abs_start,
6917                current_len,
6918                initial_state,
6919                stats,
6920                out,
6921            ),
6922        }
6923    }
6924
6925    /// Cross-platform DP entry. Picks the kernel-specific variant so the
6926    /// entire optimal-parser DP body (per-position match gathering, price
6927    /// updates, traceback) runs inside a single `target_feature` umbrella
6928    /// alongside the per-position `collect_optimal_candidates_initialized_
6929    /// <kernel>`. This eliminates the final ABI barrier on the hot per-
6930    /// position match-collection call — the level22 critical path is now
6931    /// one straight-line inline chain from DP body down through BT walk
6932    /// and match-length probes.
6933    #[inline(always)]
6934    fn build_optimal_plan_impl<
6935        S: super::strategy::Strategy,
6936        const ACCURATE_PRICE: bool,
6937        const FAVOR_SMALL_OFFSETS: bool,
6938    >(
6939        &mut self,
6940        current: &[u8],
6941        current_abs_start: usize,
6942        current_len: usize,
6943        initial_state: HcOptimalPlanState,
6944        stats: &HcOptState,
6945        out: &mut Vec<HcOptimalSequence>,
6946    ) -> (u32, [u32; 3], usize, usize) {
6947        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6948        unsafe {
6949            self.build_optimal_plan_impl_neon::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
6950                current,
6951                current_abs_start,
6952                current_len,
6953                initial_state,
6954                stats,
6955                out,
6956            )
6957        }
6958        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6959        {
6960            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
6961            match select_kernel() {
6962                FastpathKernel::Avx2Bmi2 => unsafe {
6963                    self.build_optimal_plan_impl_avx2_bmi2::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
6964                        current,
6965                        current_abs_start,
6966                        current_len,
6967                        initial_state,
6968                        stats,
6969                        out,
6970                    )
6971                },
6972                FastpathKernel::Sse42 => unsafe {
6973                    self.build_optimal_plan_impl_sse42::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
6974                        current,
6975                        current_abs_start,
6976                        current_len,
6977                        initial_state,
6978                        stats,
6979                        out,
6980                    )
6981                },
6982                FastpathKernel::Scalar => self
6983                    .build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
6984                        current,
6985                        current_abs_start,
6986                        current_len,
6987                        initial_state,
6988                        stats,
6989                        out,
6990                    ),
6991            }
6992        }
6993        // wasm with simd128: route through the simd128 DP body (4-lane price-set).
6994        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
6995        unsafe {
6996            self.build_optimal_plan_impl_simd128::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
6997                current,
6998                current_abs_start,
6999                current_len,
7000                initial_state,
7001                stats,
7002                out,
7003            )
7004        }
7005        #[cfg(not(any(
7006            all(target_arch = "aarch64", target_endian = "little"),
7007            target_arch = "x86",
7008            target_arch = "x86_64",
7009            all(target_arch = "wasm32", target_feature = "simd128")
7010        )))]
7011        {
7012            self.build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7013                current,
7014                current_abs_start,
7015                current_len,
7016                initial_state,
7017                stats,
7018                out,
7019            )
7020        }
7021    }
7022
7023    /// NEON-umbrella DP body. Inlines
7024    /// `collect_optimal_candidates_initialized_neon` (and its entire
7025    /// per-position pipeline) directly into the DP loop.
7026    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7027    #[target_feature(enable = "neon")]
7028    unsafe fn build_optimal_plan_impl_neon<
7029        S: super::strategy::Strategy,
7030        const ACCURATE_PRICE: bool,
7031        const FAVOR_SMALL_OFFSETS: bool,
7032    >(
7033        &mut self,
7034        current: &[u8],
7035        current_abs_start: usize,
7036        current_len: usize,
7037        initial_state: HcOptimalPlanState,
7038        stats: &HcOptState,
7039        out: &mut Vec<HcOptimalSequence>,
7040    ) -> (u32, [u32; 3], usize, usize) {
7041        build_optimal_plan_impl_body!(
7042            self,
7043            S,
7044            current,
7045            current_abs_start,
7046            current_len,
7047            initial_state,
7048            stats,
7049            out,
7050            collect_optimal_candidates_initialized_neon,
7051            priceset_range_nonabort_neon,
7052        )
7053    }
7054
7055    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7056    #[target_feature(enable = "sse4.2")]
7057    unsafe fn build_optimal_plan_impl_sse42<
7058        S: super::strategy::Strategy,
7059        const ACCURATE_PRICE: bool,
7060        const FAVOR_SMALL_OFFSETS: bool,
7061    >(
7062        &mut self,
7063        current: &[u8],
7064        current_abs_start: usize,
7065        current_len: usize,
7066        initial_state: HcOptimalPlanState,
7067        stats: &HcOptState,
7068        out: &mut Vec<HcOptimalSequence>,
7069    ) -> (u32, [u32; 3], usize, usize) {
7070        build_optimal_plan_impl_body!(
7071            self,
7072            S,
7073            current,
7074            current_abs_start,
7075            current_len,
7076            initial_state,
7077            stats,
7078            out,
7079            collect_optimal_candidates_initialized_sse42,
7080            priceset_range_nonabort_sse41,
7081        )
7082    }
7083
7084    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7085    #[target_feature(enable = "avx2,bmi2")]
7086    unsafe fn build_optimal_plan_impl_avx2_bmi2<
7087        S: super::strategy::Strategy,
7088        const ACCURATE_PRICE: bool,
7089        const FAVOR_SMALL_OFFSETS: bool,
7090    >(
7091        &mut self,
7092        current: &[u8],
7093        current_abs_start: usize,
7094        current_len: usize,
7095        initial_state: HcOptimalPlanState,
7096        stats: &HcOptState,
7097        out: &mut Vec<HcOptimalSequence>,
7098    ) -> (u32, [u32; 3], usize, usize) {
7099        build_optimal_plan_impl_body!(
7100            self,
7101            S,
7102            current,
7103            current_abs_start,
7104            current_len,
7105            initial_state,
7106            stats,
7107            out,
7108            collect_optimal_candidates_initialized_avx2_bmi2,
7109            priceset_range_nonabort_avx2,
7110        )
7111    }
7112
7113    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
7114    // Body macros wrap callees in `unsafe { }` for the NEON/AVX/SSE
7115    // variants where callees are `unsafe fn`. The scalar wrappers route
7116    // through safe fns, so those blocks are redundant on this path.
7117    #[allow(unused_unsafe)]
7118    // The dispatch reaches this only on non-SIMD x86 (Scalar tier) and the
7119    // portable fallback; on wasm+simd128 the simd128 wrapper is selected, so
7120    // this is cfg-dead there.
7121    #[cfg_attr(
7122        all(target_arch = "wasm32", target_feature = "simd128"),
7123        allow(dead_code)
7124    )]
7125    fn build_optimal_plan_impl_scalar<
7126        S: super::strategy::Strategy,
7127        const ACCURATE_PRICE: bool,
7128        const FAVOR_SMALL_OFFSETS: bool,
7129    >(
7130        &mut self,
7131        current: &[u8],
7132        current_abs_start: usize,
7133        current_len: usize,
7134        initial_state: HcOptimalPlanState,
7135        stats: &HcOptState,
7136        out: &mut Vec<HcOptimalSequence>,
7137    ) -> (u32, [u32; 3], usize, usize) {
7138        build_optimal_plan_impl_body!(
7139            self,
7140            S,
7141            current,
7142            current_abs_start,
7143            current_len,
7144            initial_state,
7145            stats,
7146            out,
7147            collect_optimal_candidates_initialized_scalar,
7148            priceset_range_nonabort_scalar,
7149        )
7150    }
7151
7152    /// wasm `simd128`-umbrella DP body: scalar candidate collection (no wasm
7153    /// collect kernel) but the simd128 4-lane price-set.
7154    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
7155    #[target_feature(enable = "simd128")]
7156    // With `+simd128` in the wasm baseline the shared body macro's `unsafe`
7157    // blocks (needed by the safe scalar wrapper) are redundant inside this
7158    // target_feature fn.
7159    #[allow(unused_unsafe)]
7160    unsafe fn build_optimal_plan_impl_simd128<
7161        S: super::strategy::Strategy,
7162        const ACCURATE_PRICE: bool,
7163        const FAVOR_SMALL_OFFSETS: bool,
7164    >(
7165        &mut self,
7166        current: &[u8],
7167        current_abs_start: usize,
7168        current_len: usize,
7169        initial_state: HcOptimalPlanState,
7170        stats: &HcOptState,
7171        out: &mut Vec<HcOptimalSequence>,
7172    ) -> (u32, [u32; 3], usize, usize) {
7173        build_optimal_plan_impl_body!(
7174            self,
7175            S,
7176            current,
7177            current_abs_start,
7178            current_len,
7179            initial_state,
7180            stats,
7181            out,
7182            collect_optimal_candidates_initialized_scalar,
7183            priceset_range_nonabort_simd128,
7184        )
7185    }
7186
7187    #[cfg(test)]
7188    fn collect_optimal_candidates(
7189        &mut self,
7190        abs_pos: usize,
7191        current_abs_end: usize,
7192        profile: HcOptimalCostProfile,
7193        query: HcCandidateQuery,
7194        out: &mut Vec<MatchCandidate>,
7195    ) {
7196        use super::strategy::{self, StrategyTag};
7197        self.table.ensure_tables();
7198        // Dispatch purely from `self.strategy_tag` (set by
7199        // `configure()`). Tests must configure the matcher the same
7200        // way production does — wiring up `table.hash3_log` directly
7201        // without setting a matching `strategy_tag` is no longer
7202        // allowed.
7203        match self.strategy_tag {
7204            StrategyTag::BtUltra2 => self
7205                .collect_optimal_candidates_initialized::<strategy::BtUltra2, true>(
7206                    abs_pos,
7207                    current_abs_end,
7208                    profile,
7209                    query,
7210                    out,
7211                ),
7212            StrategyTag::BtUltra => self
7213                .collect_optimal_candidates_initialized::<strategy::BtUltra, true>(
7214                    abs_pos,
7215                    current_abs_end,
7216                    profile,
7217                    query,
7218                    out,
7219                ),
7220            StrategyTag::Btlazy2 => self
7221                .collect_optimal_candidates_initialized::<strategy::Btlazy2, true>(
7222                    abs_pos,
7223                    current_abs_end,
7224                    profile,
7225                    query,
7226                    out,
7227                ),
7228            StrategyTag::BtOpt => self
7229                .collect_optimal_candidates_initialized::<strategy::BtOpt, true>(
7230                    abs_pos,
7231                    current_abs_end,
7232                    profile,
7233                    query,
7234                    out,
7235                ),
7236            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
7237                self.collect_optimal_candidates_initialized::<strategy::Lazy, false>(
7238                    abs_pos,
7239                    current_abs_end,
7240                    profile,
7241                    query,
7242                    out,
7243                )
7244            }
7245        }
7246    }
7247
7248    /// Cross-platform entry. Picks the kernel-specific variant so the per-
7249    /// position pipeline (BT-tree fill, rep probing, hash3 probing, BT
7250    /// collect / HC chain walk) runs inside a single `target_feature`
7251    /// umbrella — all inner SIMD probes inline without ABI barriers.
7252    ///
7253    /// The on-encode hot path bypasses this dispatcher: `build_optimal_plan_impl_<kernel>`
7254    /// calls the matching `_<kernel>` variant directly. This entry is kept
7255    /// for the cfg(test)-only `collect_optimal_candidates` shim and any
7256    /// future caller that isn't already inside a kernel umbrella.
7257    #[allow(dead_code)]
7258    #[inline(always)]
7259    fn collect_optimal_candidates_initialized<
7260        S: super::strategy::Strategy,
7261        const USE_BT_MATCHFINDER: bool,
7262    >(
7263        &mut self,
7264        abs_pos: usize,
7265        current_abs_end: usize,
7266        profile: HcOptimalCostProfile,
7267        query: HcCandidateQuery,
7268        out: &mut Vec<MatchCandidate>,
7269    ) {
7270        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7271        unsafe {
7272            self.collect_optimal_candidates_initialized_neon::<S, USE_BT_MATCHFINDER>(
7273                abs_pos,
7274                current_abs_end,
7275                profile,
7276                query,
7277                out,
7278            )
7279        }
7280        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7281        {
7282            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
7283            match select_kernel() {
7284                FastpathKernel::Avx2Bmi2 => unsafe {
7285                    self.collect_optimal_candidates_initialized_avx2_bmi2::<S, USE_BT_MATCHFINDER>(
7286                        abs_pos,
7287                        current_abs_end,
7288                        profile,
7289                        query,
7290                        out,
7291                    )
7292                },
7293                FastpathKernel::Sse42 => unsafe {
7294                    self.collect_optimal_candidates_initialized_sse42::<S, USE_BT_MATCHFINDER>(
7295                        abs_pos,
7296                        current_abs_end,
7297                        profile,
7298                        query,
7299                        out,
7300                    )
7301                },
7302                FastpathKernel::Scalar => self
7303                    .collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
7304                        abs_pos,
7305                        current_abs_end,
7306                        profile,
7307                        query,
7308                        out,
7309                    ),
7310            }
7311        }
7312        #[cfg(not(any(
7313            all(target_arch = "aarch64", target_endian = "little"),
7314            target_arch = "x86",
7315            target_arch = "x86_64"
7316        )))]
7317        {
7318            self.collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
7319                abs_pos,
7320                current_abs_end,
7321                profile,
7322                query,
7323                out,
7324            )
7325        }
7326    }
7327
7328    /// NEON-umbrella variant. Every inner helper (`bt_update_tree_until_neon`,
7329    /// `for_each_repcode_candidate_with_reps_neon`, `hash3_candidate_neon`,
7330    /// `bt_insert_and_collect_matches_neon`, `fastpath::neon::
7331    /// common_prefix_len_ptr`) shares the NEON umbrella so the per-position
7332    /// pipeline executes as a single straight-line inline sequence.
7333    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7334    #[target_feature(enable = "neon")]
7335    unsafe fn collect_optimal_candidates_initialized_neon<
7336        S: super::strategy::Strategy,
7337        const USE_BT_MATCHFINDER: bool,
7338    >(
7339        &mut self,
7340        abs_pos: usize,
7341        current_abs_end: usize,
7342        profile: HcOptimalCostProfile,
7343        query: HcCandidateQuery,
7344        out: &mut Vec<MatchCandidate>,
7345    ) {
7346        collect_optimal_candidates_initialized_body!(
7347            self,
7348            S,
7349            abs_pos,
7350            current_abs_end,
7351            profile,
7352            query,
7353            out,
7354            USE_BT_MATCHFINDER,
7355            bt_update_tree_until_neon,
7356            bt_insert_and_collect_matches_neon,
7357            for_each_repcode_candidate_with_reps_neon,
7358            hash3_candidate_neon,
7359            crate::encoding::fastpath::neon::common_prefix_len_ptr,
7360        )
7361    }
7362
7363    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7364    #[target_feature(enable = "sse4.2")]
7365    unsafe fn collect_optimal_candidates_initialized_sse42<
7366        S: super::strategy::Strategy,
7367        const USE_BT_MATCHFINDER: bool,
7368    >(
7369        &mut self,
7370        abs_pos: usize,
7371        current_abs_end: usize,
7372        profile: HcOptimalCostProfile,
7373        query: HcCandidateQuery,
7374        out: &mut Vec<MatchCandidate>,
7375    ) {
7376        collect_optimal_candidates_initialized_body!(
7377            self,
7378            S,
7379            abs_pos,
7380            current_abs_end,
7381            profile,
7382            query,
7383            out,
7384            USE_BT_MATCHFINDER,
7385            bt_update_tree_until_sse42,
7386            bt_insert_and_collect_matches_sse42,
7387            for_each_repcode_candidate_with_reps_sse42,
7388            hash3_candidate_sse42,
7389            crate::encoding::fastpath::sse42::common_prefix_len_ptr,
7390        )
7391    }
7392
7393    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7394    #[target_feature(enable = "avx2,bmi2")]
7395    unsafe fn collect_optimal_candidates_initialized_avx2_bmi2<
7396        S: super::strategy::Strategy,
7397        const USE_BT_MATCHFINDER: bool,
7398    >(
7399        &mut self,
7400        abs_pos: usize,
7401        current_abs_end: usize,
7402        profile: HcOptimalCostProfile,
7403        query: HcCandidateQuery,
7404        out: &mut Vec<MatchCandidate>,
7405    ) {
7406        collect_optimal_candidates_initialized_body!(
7407            self,
7408            S,
7409            abs_pos,
7410            current_abs_end,
7411            profile,
7412            query,
7413            out,
7414            USE_BT_MATCHFINDER,
7415            bt_update_tree_until_avx2_bmi2,
7416            bt_insert_and_collect_matches_avx2_bmi2,
7417            for_each_repcode_candidate_with_reps_avx2_bmi2,
7418            hash3_candidate_avx2_bmi2,
7419            crate::encoding::fastpath::avx2_bmi2::common_prefix_len_ptr,
7420        )
7421    }
7422
7423    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
7424    // Macro emits `unsafe { }` wrappers for NEON/AVX/SSE variants; scalar
7425    // callees are safe so the blocks are redundant here only.
7426    #[allow(unused_unsafe)]
7427    fn collect_optimal_candidates_initialized_scalar<
7428        S: super::strategy::Strategy,
7429        const USE_BT_MATCHFINDER: bool,
7430    >(
7431        &mut self,
7432        abs_pos: usize,
7433        current_abs_end: usize,
7434        profile: HcOptimalCostProfile,
7435        query: HcCandidateQuery,
7436        out: &mut Vec<MatchCandidate>,
7437    ) {
7438        collect_optimal_candidates_initialized_body!(
7439            self,
7440            S,
7441            abs_pos,
7442            current_abs_end,
7443            profile,
7444            query,
7445            out,
7446            USE_BT_MATCHFINDER,
7447            bt_update_tree_until_scalar,
7448            bt_insert_and_collect_matches_scalar,
7449            for_each_repcode_candidate_with_reps_scalar,
7450            hash3_candidate_scalar,
7451            crate::encoding::fastpath::scalar::common_prefix_len_ptr,
7452        )
7453    }
7454}
7455
7456#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
7457#[test]
7458fn matches() {
7459    let mut matcher = MatchGenerator::new(1000);
7460    let mut original_data = Vec::new();
7461    let mut reconstructed = Vec::new();
7462
7463    let replay_sequence = |seq: Sequence<'_>, reconstructed: &mut Vec<u8>| match seq {
7464        Sequence::Literals { literals } => {
7465            assert!(!literals.is_empty());
7466            reconstructed.extend_from_slice(literals);
7467        }
7468        Sequence::Triple {
7469            literals,
7470            offset,
7471            match_len,
7472        } => {
7473            assert!(offset > 0);
7474            assert!(match_len >= MIN_MATCH_LEN);
7475            reconstructed.extend_from_slice(literals);
7476            assert!(offset <= reconstructed.len());
7477            let start = reconstructed.len() - offset;
7478            for i in 0..match_len {
7479                let byte = reconstructed[start + i];
7480                reconstructed.push(byte);
7481            }
7482        }
7483    };
7484
7485    matcher.add_data(
7486        alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
7487        SuffixStore::with_capacity(100),
7488        |_, _| {},
7489    );
7490    original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
7491
7492    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7493
7494    assert!(!matcher.next_sequence(|_| {}));
7495
7496    matcher.add_data(
7497        alloc::vec![
7498            1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
7499        ],
7500        SuffixStore::with_capacity(100),
7501        |_, _| {},
7502    );
7503    original_data.extend_from_slice(&[
7504        1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
7505    ]);
7506
7507    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7508    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7509    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7510    assert!(!matcher.next_sequence(|_| {}));
7511
7512    matcher.add_data(
7513        alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
7514        SuffixStore::with_capacity(100),
7515        |_, _| {},
7516    );
7517    original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);
7518
7519    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7520    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7521    assert!(!matcher.next_sequence(|_| {}));
7522
7523    matcher.add_data(
7524        alloc::vec![0, 0, 0, 0, 0],
7525        SuffixStore::with_capacity(100),
7526        |_, _| {},
7527    );
7528    original_data.extend_from_slice(&[0, 0, 0, 0, 0]);
7529
7530    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7531    assert!(!matcher.next_sequence(|_| {}));
7532
7533    matcher.add_data(
7534        alloc::vec![7, 8, 9, 10, 11],
7535        SuffixStore::with_capacity(100),
7536        |_, _| {},
7537    );
7538    original_data.extend_from_slice(&[7, 8, 9, 10, 11]);
7539
7540    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7541    assert!(!matcher.next_sequence(|_| {}));
7542
7543    matcher.add_data(
7544        alloc::vec![1, 3, 5, 7, 9],
7545        SuffixStore::with_capacity(100),
7546        |_, _| {},
7547    );
7548    matcher.skip_matching();
7549    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
7550    reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
7551    assert!(!matcher.next_sequence(|_| {}));
7552
7553    matcher.add_data(
7554        alloc::vec![1, 3, 5, 7, 9],
7555        SuffixStore::with_capacity(100),
7556        |_, _| {},
7557    );
7558    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
7559
7560    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7561    assert!(!matcher.next_sequence(|_| {}));
7562
7563    matcher.add_data(
7564        alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
7565        SuffixStore::with_capacity(100),
7566        |_, _| {},
7567    );
7568    original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);
7569
7570    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7571    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
7572    assert!(!matcher.next_sequence(|_| {}));
7573
7574    assert_eq!(reconstructed, original_data);
7575}
7576
/// Feeds two identical 128 KiB blocks (a cycled 10-byte pattern) through the
/// dfast matcher and checks that replaying the emitted sequences rebuilds
/// each block, with the second block decoded on top of the first block's
/// history.
#[test]
fn dfast_matches_roundtrip_multi_block_pattern() {
    /// Applies one emitted sequence to `decoded`, the way a decoder would.
    fn apply(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        let (literals, copy) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        decoded.extend_from_slice(literals);
        if let Some((offset, match_len)) = copy {
            // Copy one byte at a time: the source range may overlap the
            // bytes being appended when `offset < match_len`.
            let from = decoded.len() - offset;
            for src in from..from + match_len {
                let byte = decoded[src];
                decoded.push(byte);
            }
        }
    }

    const BLOCK_LEN: usize = 128 * 1024;
    let pattern = [9, 21, 44, 184, 19, 96, 171, 109, 141, 251];
    let build_block = || -> Vec<u8> { pattern.iter().copied().cycle().take(BLOCK_LEN).collect() };
    let first_block = build_block();
    let second_block = build_block();

    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // Block 1: decoded output must equal the block itself.
    matcher.add_data(first_block.clone(), |_| {});
    let mut history = Vec::new();
    matcher.start_matching(|seq| apply(&mut history, seq));
    assert_eq!(history, first_block);

    // Block 2: decoded on top of block 1's output so matches may reach back
    // into the previous block.
    matcher.add_data(second_block.clone(), |_| {});
    let prefix_len = history.len();
    matcher.start_matching(|seq| apply(&mut history, seq));

    let (_, decoded_second) = history.split_at(prefix_len);
    assert_eq!(decoded_second, second_block.as_slice());
}
7611
/// Regression for the `DFAST_MIN_MATCH_LEN: 6 -> 5` drop. The fixture
/// is built so the longest available match at the match site is EXACTLY
/// 5 bytes — a matcher that still effectively requires a 6-byte floor
/// would emit only literals here and the assertion would catch the silent
/// 5-byte miss.
///
/// Fixture layout (55 B):
///   byte  0       `'Z'`      — lead byte; keeps the match source off
///                              position 0
///   bytes 1..6    `"ABCDE"`  — match source
///   bytes 6..29   `'!'` × 23 — filler that does NOT start with 'A'
///   bytes 29..34  `"ABCDE"`  — match site (repeats the source)
///   byte  34      `'F'`      — terminator: differs from byte 6 (`'!'`),
///                              so the forward extension at the match
///                              site stops at exactly length 5
///   bytes 35..55  `"GHIJKLMNOPQRSTUVWXYZ"` — trailing filler
///
/// A 5-byte match at offset 28 (29 - 1) must be emitted; a 6-byte+ match
/// at the same offset must NOT.
#[test]
fn dfast_accepts_exact_five_byte_match() {
    let mut data = Vec::new();
    // Lead byte: keeps the match SOURCE off position 0, which the greedy
    // loop never inserts — like the upstream zstd it starts the cursor at
    // ip+1 and hashes only visited positions.
    data.push(b'Z'); // 0
    data.extend_from_slice(b"ABCDE"); // 1..6   match source (position 1 IS visited)
    data.extend_from_slice(b"!!!!!!!!!!!!!!!!!!!!!!!"); // 6..29 (23 bytes)
    data.extend_from_slice(b"ABCDE"); // 29..34 match site
    data.push(b'F'); // 34: forces forward extension to stop at length 5
    // Trailing filler so the match site (29) sits at least HASH_READ_SIZE (8)
    // bytes before the block end. The greedy double-fast — like the upstream zstd —
    // stops probing at `ilimit = iend - HASH_READ_SIZE`, so a match in the
    // final 8 bytes is never searched (upstream zstd parity, not a regression).
    data.extend_from_slice(b"GHIJKLMNOPQRSTUVWXYZ"); // 35..55
    assert_eq!(data.len(), 55);

    // Fixture self-checks: the documented layout must actually hold, otherwise
    // the offset/length expectations below are meaningless.
    assert_eq!(&data[1..6], &data[29..34], "match source and site must agree");
    assert_ne!(data[6], data[34], "terminator must break the extension");

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    matcher.add_data(data.clone(), |_| {});

    let mut saw_five_byte_match = false;
    let mut saw_longer_match = false;
    matcher.start_matching(|seq| {
        if let Sequence::Triple {
            offset, match_len, ..
        } = seq
        {
            // Only matches at the fixture's offset (site 29 - source 1) count.
            if offset == 28 && match_len == 5 {
                saw_five_byte_match = true;
            } else if offset == 28 && match_len > 5 {
                saw_longer_match = true;
            }
        }
    });

    assert!(
        saw_five_byte_match,
        "dfast must accept the exact-5-byte match — a 6-byte floor would skip it"
    );
    assert!(
        !saw_longer_match,
        "fixture pinned to length 5 — byte 34 ('F') must terminate the extension"
    );
}
7683
/// Resets the driver to `Default`, checks the reported backend and window
/// size, skips one committed slice, then verifies the next identical slice
/// round-trips on top of it; finally checks the window size after a reset
/// to `Fastest`.
#[test]
fn driver_switches_backends_and_initializes_dfast_via_reset() {
    /// Replays one emitted sequence onto `out`.
    fn replay(out: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => out.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                out.extend_from_slice(literals);
                // Byte-wise copy keeps overlapping matches correct.
                let from = out.len() - offset;
                for src in from..from + match_len {
                    let byte = out[src];
                    out.push(byte);
                }
            }
        }
    }

    const PAYLOAD: &[u8; 12] = b"abcabcabcabc";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Default);
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Dfast);
    assert_eq!(driver.window_size(), (1u64 << 21));

    // First slice is committed and then skipped (no sequences requested).
    let mut first = driver.get_next_space();
    first[..12].copy_from_slice(PAYLOAD);
    first.truncate(12);
    driver.commit_space(first);
    assert_eq!(driver.get_last_space(), PAYLOAD);
    driver.skip_matching_with_hint(None);

    // Second slice carries the same bytes and is actually matched.
    let mut second = driver.get_next_space();
    second[..12].copy_from_slice(PAYLOAD);
    second.truncate(12);
    driver.commit_space(second);

    // The skipped slice is pre-seeded so match offsets can reach into it.
    let mut reconstructed = PAYLOAD.to_vec();
    driver.start_matching(|seq| replay(&mut reconstructed, seq));
    assert_eq!(reconstructed, b"abcabcabcabcabcabcabcabc");

    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), (1u64 << 19));
}
7725
/// Level 5 must select the Row backend with the greedy parse mode.
#[test]
fn driver_level5_selects_row_backend() {
    use super::strategy::{BackendTag, ParseMode};

    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.reset(CompressionLevel::Level(5));

    let backend = driver.active_backend();
    assert_eq!(backend, BackendTag::Row);

    // Greedy-specific routing check. `MatchGeneratorDriver::start_matching`
    // sends the Row backend into `start_matching_greedy` iff
    // `self.parse == ParseMode::Greedy`, so the selector itself is asserted:
    // a plain round-trip would also pass on the lazy parser.
    assert_eq!(
        driver.parse,
        ParseMode::Greedy,
        "L5 must route to start_matching_greedy (parse == Greedy)",
    );

    // Secondary corroboration: `row_matcher().lazy_depth` mirrors the parse
    // mode. Checking `parse` first catches a regression even if the two
    // ever drift apart.
    let row_lazy_depth = driver.row_matcher().lazy_depth;
    assert_eq!(
        row_lazy_depth, 0,
        "row matcher lazy_depth must mirror the greedy parse mode",
    );
}
7749
/// Level 4 is the greedy double-fast (`StrategyTag::Dfast`, upstream zstd
/// `ZSTD_dfast`); "greedy" names the parse discipline, not the Row/Greedy
/// strategy used at Level 5. A round-trip alone cannot pin match quality
/// (a lazy parser would reconstruct the input just as well), so this test
/// also requires at least one `Sequence::Triple` on a small repeating
/// pattern. A regression that degrades to literals-only output (e.g. a
/// `min_match` or rep-probe guard regression) is caught.
#[test]
fn driver_level4_greedy_round_trip_single_slice() {
    /// Replays one emitted sequence onto `out`.
    fn replay(out: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => out.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                out.extend_from_slice(literals);
                let from = out.len() - offset;
                for src in from..from + match_len {
                    let byte = out[src];
                    out.push(byte);
                }
            }
        }
    }

    let input = b"abcdefgh_abcdefgh_abcdefgh_abcdefgh";

    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    // Stage the whole input as a single slice.
    let mut space = driver.get_next_space();
    space[..input.len()].copy_from_slice(input);
    space.truncate(input.len());
    driver.commit_space(space);

    let mut reconstructed: Vec<u8> = Vec::new();
    let mut saw_triple = false;
    driver.start_matching(|seq| {
        saw_triple |= matches!(seq, Sequence::Triple { .. });
        replay(&mut reconstructed, seq);
    });

    assert_eq!(
        reconstructed,
        input.to_vec(),
        "L4 greedy parse failed to reconstruct repeating-pattern input",
    );
    assert!(
        saw_triple,
        "L4 greedy parse on a repeating pattern must emit at least one match (Triple)",
    );
}
7795
/// The greedy L4 parse must carry match state across slice boundaries: the
/// second slice repeats the first byte for byte, so its parse has to find
/// matches that reach back into the previous slice's history.
#[test]
fn driver_level4_greedy_round_trip_cross_slice() {
    /// Copies `bytes` into the driver's next space and commits it as a slice.
    fn feed(driver: &mut MatchGeneratorDriver, bytes: &[u8]) {
        let mut space = driver.get_next_space();
        space[..bytes.len()].copy_from_slice(bytes);
        space.truncate(bytes.len());
        driver.commit_space(space);
    }

    /// Replays one emitted sequence onto `out`.
    fn replay(out: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => out.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                out.extend_from_slice(literals);
                let from = out.len() - offset;
                for src in from..from + match_len {
                    let byte = out[src];
                    out.push(byte);
                }
            }
        }
    }

    let mut driver = MatchGeneratorDriver::new(32, 4);
    driver.reset(CompressionLevel::Level(4));
    let chunk = b"the quick brown fox jumps over!!";
    assert_eq!(chunk.len(), 32);

    // Slice 1: must round-trip on its own.
    feed(&mut driver, chunk);
    let mut first_recon: Vec<u8> = Vec::new();
    driver.start_matching(|seq| replay(&mut first_recon, seq));
    assert_eq!(
        first_recon,
        chunk.to_vec(),
        "first slice failed to round-trip"
    );

    // Slice 2: same bytes again, decoded on top of slice 1's output.
    feed(&mut driver, chunk);
    let mut full = first_recon.clone();
    let mut saw_cross_slice_match = false;
    driver.start_matching(|seq| {
        // Any position inside the second slice is less than `chunk.len()`
        // bytes from the slice start, so an offset of at least `chunk.len()`
        // necessarily points into the first slice — the behavior under test.
        if let Sequence::Triple { offset, .. } = seq {
            if offset >= chunk.len() {
                saw_cross_slice_match = true;
            }
        }
        replay(&mut full, seq);
    });

    let expected = [&chunk[..], &chunk[..]].concat();
    assert_eq!(
        full, expected,
        "cross-slice L4 greedy parse failed to reconstruct"
    );
    assert!(
        saw_cross_slice_match,
        "L4 greedy parse must match across slice boundaries (history is shared)",
    );
}
7873
/// Test-only extensions on [`MatchGeneratorDriver`] for routing a level
/// through a non-default parse×search pairing.
#[cfg(test)]
impl MatchGeneratorDriver {
    /// Test-only: stage a parse×search recipe override applied on the
    /// next `reset()`. Routes a level through a non-default (parse,
    /// search) pair so the decoupling can be exercised end-to-end.
    ///
    /// The pair is stored in `config_override` as `(search, parse)`.
    pub(crate) fn set_config_override(
        &mut self,
        search: super::strategy::SearchMethod,
        parse: super::strategy::ParseMode,
    ) {
        self.config_override = Some((search, parse));
    }

    /// Test-only: reset `level` routed onto the lazy HashChain pairing.
    /// The lazy band runs on the Row backend in production, so HC-specific
    /// behaviour (live-chain dict prime, eviction budget accounting, seed
    /// pass gates) is exercised through this override-backed reset.
    pub(crate) fn reset_on_hc_lazy(&mut self, level: CompressionLevel) {
        use super::strategy::{ParseMode, SearchMethod};

        // Arm the override first; the following `reset` picks it up.
        self.set_config_override(SearchMethod::HashChain, ParseMode::Lazy2);
        self.reset(level);
    }
}
7902
/// Runs a complete compress parse over `data` at `level` and rebuilds the
/// bytes from the emitted sequences.
///
/// When `over` is `Some((search, parse))`, that pairing is staged via
/// `set_config_override` before the driver is reset. The input is fed one
/// driver space at a time, and each committed slice is matched immediately.
/// For a correct parse the returned buffer equals `data`.
#[cfg(test)]
fn drive_roundtrip_with_override(
    level: CompressionLevel,
    over: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    data: &[u8],
) -> Vec<u8> {
    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);
    if let Some((search, parse)) = over {
        driver.set_config_override(search, parse);
    }
    driver.reset(level);

    let mut decoded: Vec<u8> = Vec::with_capacity(data.len());
    let mut remaining = data;
    while !remaining.is_empty() {
        // Fill as much of the next space as the remaining input allows.
        let mut space = driver.get_next_space();
        let take = remaining.len().min(space.len());
        let (head, tail) = remaining.split_at(take);
        space[..take].copy_from_slice(head);
        space.truncate(take);
        driver.commit_space(space);
        remaining = tail;

        driver.start_matching(|seq| {
            let (literals, copy) = match seq {
                Sequence::Literals { literals } => (literals, None),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals, Some((offset, match_len))),
            };
            decoded.extend_from_slice(literals);
            if let Some((offset, match_len)) = copy {
                // Byte-wise copy keeps overlapping matches correct.
                let from = decoded.len() - offset;
                for src in from..from + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        });
    }
    decoded
}
7946
/// Phase 1 capability proof: parse and search are decoupled, so a level can
/// run any parse mode on any non-opt search backend. The pairings below
/// (e.g. Greedy-on-HashChain, Lazy2-on-RowHash) could not be expressed by
/// the legacy `strategy_tag`; each must reconstruct the input exactly.
#[test]
fn parse_search_matrix_decoupled_roundtrips() {
    use super::strategy::{ParseMode, SearchMethod};

    // Mixed repetitive + literal payload that exercises matches and reps:
    // a fixed phrase followed by a little-endian counter, 4000 times.
    let data: Vec<u8> = (0..4000u32)
        .flat_map(|i| {
            b"the quick brown fox "
                .iter()
                .copied()
                .chain(i.to_le_bytes())
        })
        .collect();

    let cases = [
        // Greedy parse on the HashChain search backend (legacy: Greedy was
        // welded to RowHash).
        (
            CompressionLevel::Level(5),
            SearchMethod::HashChain,
            ParseMode::Greedy,
            "greedy-on-hashchain diverged",
        ),
        // Lazy2 parse on the RowHash search backend (legacy: Lazy was welded
        // to HashChain).
        (
            CompressionLevel::Level(8),
            SearchMethod::RowHash,
            ParseMode::Lazy2,
            "lazy2-on-rowhash diverged",
        ),
        // Lazy on RowHash too (depth 1).
        (
            CompressionLevel::Level(6),
            SearchMethod::RowHash,
            ParseMode::Lazy,
            "lazy-on-rowhash diverged",
        ),
    ];

    for (level, search, parse, label) in cases {
        let got = drive_roundtrip_with_override(level, Some((search, parse)), &data);
        assert_eq!(got, data, "{label}");
    }
}
7987
/// The row `mls` knob (C-like `minMatch`) is honoured: for every value in
/// 4..=7 the stream round-trips and no emitted match is shorter than `mls`.
/// The default (5) reproduces the historical `ROW_MIN_MATCH_LEN` behaviour.
#[test]
fn row_mls_knob_gates_matches_and_roundtrips() {
    // "abcdefgh" followed by a little-endian counter, 4000 times.
    let mut data: Vec<u8> = Vec::new();
    for i in 0..4000u32 {
        data.extend_from_slice(b"abcdefgh");
        data.extend_from_slice(&i.to_le_bytes());
    }

    for mls in [4usize, 5, 6, 7] {
        let mut cfg = ROW_CONFIG;
        cfg.mls = mls;

        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(cfg);
        matcher.add_data(data.clone(), |_| {});

        let mut out: Vec<u8> = Vec::with_capacity(data.len());
        // `None` until the first match is seen.
        let mut shortest: Option<usize> = None;
        matcher.start_matching(|seq| {
            let (literals, copy) = match seq {
                Sequence::Literals { literals } => (literals, None),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals, Some((offset, match_len))),
            };
            out.extend_from_slice(literals);
            if let Some((offset, match_len)) = copy {
                shortest = Some(shortest.map_or(match_len, |len| len.min(match_len)));
                // Byte-wise copy keeps overlapping matches correct.
                let from = out.len() - offset;
                for src in from..from + match_len {
                    let byte = out[src];
                    out.push(byte);
                }
            }
        });

        assert_eq!(out, data, "mls={mls} round-trip diverged");
        // Only meaningful when at least one match was emitted.
        if let Some(shortest_match) = shortest {
            assert!(
                shortest_match >= mls,
                "mls={mls}: emitted a {shortest_match}-byte match below the floor",
            );
        }
    }
}
8037
/// `LevelParams::parse()` takes the parse mode from the `search` axis rather
/// than the strategy tag, so a `Bt*`-tagged level overridden to a non-BT
/// search backend still decouples correctly. Before the fix the method
/// matched on `strategy_tag` and returned `Optimal` for any `Bt*` tag,
/// ignoring `search`/`lazy_depth`.
#[test]
fn parse_mode_follows_search_axis_not_strategy_tag() {
    use super::strategy::{ParseMode, SearchMethod};

    // LEVEL_TABLE[15] is level 16: BtOpt tag, BinaryTree search.
    let mut params = LEVEL_TABLE[15];
    assert_eq!(
        params.parse(),
        ParseMode::Optimal,
        "BinaryTree search → Optimal"
    );

    // Swap the Bt-tagged level onto a non-BT backend: the parse mode must now
    // be derived from `lazy_depth` instead of staying Optimal.
    params.search = SearchMethod::RowHash;
    let expectations = [
        (0, ParseMode::Greedy, "RowHash + depth 0 → Greedy"),
        (2, ParseMode::Lazy2, "RowHash + depth 2 → Lazy2"),
    ];
    for (depth, expected, why) in expectations {
        params.lazy_depth = depth;
        assert_eq!(params.parse(), expected, "{why}");
    }
}
8057
/// The test-only `config_override` is one-shot: the first `reset()` consumes
/// it, so a reused driver does not keep the synthetic pairing armed for
/// later resets. Before the fix `reset()` copied the override and left it
/// set.
#[test]
fn config_override_is_consumed_by_reset() {
    use super::strategy::{ParseMode, SearchMethod};

    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);

    // Arm the override and confirm it is staged.
    driver.set_config_override(SearchMethod::RowHash, ParseMode::Lazy2);
    let armed_before_reset = driver.config_override.is_some();
    assert!(armed_before_reset);

    // A single reset must clear it again.
    driver.reset(CompressionLevel::Level(5));
    let armed_after_reset = driver.config_override.is_some();
    assert!(
        !armed_after_reset,
        "override must be consumed after one reset",
    );
}
8074
// Level 4 maps to the greedy Dfast (double-fast) backend — "greedy" here is the
// parse discipline (no lazy lookahead, upstream zstd `ZSTD_dfast`), NOT the Row/Greedy
// strategy (which is Level 5). This roundtrip is intentional Dfast L4 coverage;
// the Row backend is exercised by the `Level(5)` fixtures elsewhere in this file.
//
/// Round-trips `data` through a Level 4 driver built with
/// `MatchGeneratorDriver::new(slice_size, max_slices)` and asserts that the
/// bytes rebuilt from the emitted sequences equal `data`.
///
/// Returns `(triple_count, max_offset)`: the number of `Sequence::Triple`
/// items emitted and the largest match offset seen (0 when no match was
/// emitted), so callers can probe the shape of the parse.
#[cfg(test)]
fn l4_greedy_round_trip(slice_size: usize, max_slices: usize, data: &[u8]) -> (usize, usize) {
    let mut driver = MatchGeneratorDriver::new(slice_size, max_slices);
    driver.reset(CompressionLevel::Level(4));

    let mut reconstructed: Vec<u8> = Vec::with_capacity(data.len());
    let mut triple_count = 0usize;
    let mut max_offset = 0usize;

    // `start_matching` consumes the current pending slice, so a multi-slice
    // payload is committed and driven one slice at a time; that way earlier
    // slices are replayed before they can be displaced from the window.
    let mut remaining = data;
    while !remaining.is_empty() {
        let mut space = driver.get_next_space();
        let take = remaining.len().min(space.len());
        let (head, tail) = remaining.split_at(take);
        space[..take].copy_from_slice(head);
        space.truncate(take);
        driver.commit_space(space);
        remaining = tail;

        driver.start_matching(|seq| {
            let (literals, copy) = match seq {
                Sequence::Literals { literals } => (literals, None),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals, Some((offset, match_len))),
            };
            if let Some((offset, _)) = copy {
                triple_count += 1;
                max_offset = max_offset.max(offset);
            }
            reconstructed.extend_from_slice(literals);
            if let Some((offset, match_len)) = copy {
                // Byte-wise copy keeps overlapping matches correct.
                let from = reconstructed.len() - offset;
                for src in from..from + match_len {
                    let byte = reconstructed[src];
                    reconstructed.push(byte);
                }
            }
        });
    }

    // An empty payload never enters the loop above, so run one commit/drive
    // round on an empty slice to exercise the matcher's empty-input path.
    if data.is_empty() {
        let mut space = driver.get_next_space();
        space.truncate(0);
        driver.commit_space(space);
        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
            Sequence::Triple { .. } => panic!("empty input must not emit any matches"),
        });
    }

    assert_eq!(reconstructed, data, "L4 greedy round-trip diverged");
    (triple_count, max_offset)
}
8139
/// Tail rep-only regression (flagged by CodeRabbit): the earlier outer-loop
/// guard `pos + ROW_MIN_MATCH_LEN <= current_len` (6) left the last 5-byte
/// position unreachable. Because the rep probe at `abs_pos + 1` needs only
/// 4 bytes of lookahead past the probe point, the guard was relaxed to
/// `pos + GREEDY_MIN_LOOKAHEAD <= current_len` (5). The two slices are
/// driven separately and the match must come **from the second slice's
/// parse pass**, so re-tightening the guard or breaking the cross-slice
/// repcode lookup fails here instead of hiding behind first-slice matches.
#[test]
fn driver_level5_greedy_tail_rep_only_reachable() {
    /// Copies `bytes` into the driver's next space and commits it as a slice.
    fn feed(driver: &mut MatchGeneratorDriver, bytes: &[u8]) {
        let mut space = driver.get_next_space();
        space[..bytes.len()].copy_from_slice(bytes);
        space.truncate(bytes.len());
        driver.commit_space(space);
    }

    // The period-4 first slice is meant to leave rep1 = 4 in `offset_hist`
    // by the time its tail is parsed. The second slice is exactly 5 bytes
    // (= `GREEDY_MIN_LOOKAHEAD`), so the outer loop runs **once** at
    // `pos = 0`. Per the original rationale, the regular `row_candidate`
    // needs 6 bytes from `abs_pos`, which runs past the live history, so the
    // only viable hit is the rep probe at `abs_pos + 1`. That probe sees a
    // 4-byte match at offset 4 (`second[1..5] == first[13..16] ++ second[0]
    // == "BCDA"`), and `extend_backwards_shared` is then expected to absorb
    // `second[0]` into the match (one byte back, stopping at the anchor,
    // which is the current `abs_pos`).
    let first: &[u8] = b"ABCDABCDABCDABCD"; // 16 bytes — strict period 4
    let second: &[u8] = b"ABCDA"; // 5 bytes — exact GREEDY_MIN_LOOKAHEAD

    let mut driver = MatchGeneratorDriver::new(16, 2);
    driver.reset(CompressionLevel::Level(5));

    // First slice: parsed only for its side effects on matcher state.
    feed(&mut driver, first);
    driver.start_matching(|_| {});

    // Second slice: count the matches produced by this pass alone.
    feed(&mut driver, second);
    let mut second_slice_triples = 0usize;
    driver.start_matching(|seq| {
        second_slice_triples += usize::from(matches!(seq, Sequence::Triple { .. }));
    });

    assert!(
        second_slice_triples >= 1,
        "tail rep-only position must produce a match in the second slice \
         (got {second_slice_triples} triples)",
    );
}
8193
/// A committed zero-length slice must produce no sequences and no panic.
#[test]
fn driver_level4_greedy_empty_input_emits_nothing() {
    // Intended to exercise the `current_len == 0` early-return guard at the
    // top of `start_matching_greedy`.
    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    // An empty space is still committed so the matcher has SOMETHING to
    // start on; without it `start_matching` panics on the `window.back()`
    // unwrap, which is a separate path covered by existing reset tests.
    let mut empty_slice = driver.get_next_space();
    empty_slice.truncate(0);
    driver.commit_space(empty_slice);

    let mut emitted_anything = false;
    driver.start_matching(|_| {
        emitted_anything = true;
    });
    assert!(!emitted_anything, "empty slice must not emit any sequences",);
}
8212
/// A 4-byte input is shorter than `GREEDY_MIN_LOOKAHEAD = 5`, so the outer
/// loop never runs a body iteration. The tail literal path must still emit
/// the bytes (round-trip is asserted by the helper) and no match may appear.
#[test]
fn driver_level4_greedy_sub_min_lookahead_input() {
    let triples = l4_greedy_round_trip(64, 2, b"abcd").0;
    assert_eq!(
        triples, 0,
        "sub-min-lookahead input must not emit any matches (got {triples})",
    );
}
8225
/// 256 pseudo-random bytes with no exploitable structure: every position is
/// expected to miss in both the rep probe and the row candidate, running the
/// miss branch and the `SKIP_STRENGTH = 10` skip-step grow (irrelevant at
/// this size, but the path executes).
#[test]
fn driver_level4_greedy_incompressible_input() {
    // Linear congruential generator; one output byte per step.
    let mut state: u32 = 0xDEAD_BEEF;
    let data: Vec<u8> = (0..256)
        .map(|_| {
            state = state.wrapping_mul(1_103_515_245).wrapping_add(12345);
            (state >> 16) as u8
        })
        .collect();

    // No structural assertion: passing means the helper's bit-exact
    // round-trip held and no panic / debug_assert fired.
    let _ = l4_greedy_round_trip(64, 8, &data);
}
8242
/// Stress smoke over 2 KiB of unstructured bytes. The literal run grows past
/// the `SKIP_STRENGTH = 10` threshold (~1 KiB), so the miss branch and the
/// per-miss step-grow path in `start_matching_greedy` both run.
///
/// Only a bit-exact round-trip and the absence of panics / `debug_assert!`
/// failures are checked. The `SKIP_STRENGTH` constant and the per-iteration
/// step count are deliberately NOT pinned: a round-trip would pass equally
/// with `SKIP_STRENGTH = 6` or `= 14`, and pinning the exact growth would
/// need the parse to return step / iteration metadata, which is invasive
/// plumbing for a constant that has not been re-tuned in months. The value
/// here is catching panics or correctness regressions on long
/// incompressible runs.
#[test]
fn driver_level4_greedy_long_literal_run_skip_step_growth() {
    // Multiplicative generator; the top byte of each state is emitted.
    let mut state: u32 = 0xC0FF_EE00;
    let data: Vec<u8> = (0..2048)
        .map(|_| {
            state = state.wrapping_mul(0x9E37_79B9).wrapping_add(0xCAFEBABE);
            (state >> 24) as u8
        })
        .collect();

    let _ = l4_greedy_round_trip(512, 8, &data);
}
8267
/// An all-zero payload should collapse into rep1 matching.
///
/// Past the first byte every position equals its predecessor, so the rep1
/// probe at `abs_pos + 1` is expected to hit immediately and produce one long
/// match (the "cheap rep at +1, full-match length" path).
#[test]
fn driver_level4_greedy_all_zeros_heavy_rep1() {
    let zeros: Vec<u8> = alloc::vec![0u8; 128];
    let (triples, max_offset) = l4_greedy_round_trip(64, 8, &zeros);

    assert!(
        triples >= 1,
        "all-zeros input must produce at least one rep1 match",
    );
    // Any offset above 1 would mean the rep1 probe was bypassed, since each
    // byte matches the byte directly before it.
    assert_eq!(
        max_offset, 1,
        "all-zeros L4 greedy parse should commit at offset 1 (got {max_offset})",
    );
}
8288
/// Steady-state rep-cascade coverage for the greedy parse.
///
/// A 16-byte unit repeated 32 times is fed through the level-4 greedy
/// helper. Once the period is held in `offset_hist[0]`, the main-loop rep
/// probe at `abs_pos + 1` is expected to fire on every iteration, giving a
/// long run of triples that share one offset.
#[test]
fn driver_level4_greedy_periodic_pattern_rep_cascade() {
    let unit: &[u8] = b"alpha_beta_gamma";
    assert_eq!(unit.len(), 16);

    let data: Vec<u8> = unit.repeat(32);
    let (triples, max_offset) = l4_greedy_round_trip(64, 16, &data);

    assert!(
        triples >= 1,
        "periodic 16-byte payload must emit matches (got {triples})",
    );
    assert!(
        max_offset >= 16,
        "periodic 16-byte payload must produce at least one offset >= 16 \
         (got max_offset = {max_offset})",
    );
}
8313
/// After `reset(level)`, the driver's `strategy_tag` must equal the expected
/// tag for that level, and the tag's backend must agree with the driver's
/// active backend. A fresh driver is built for every case.
#[test]
fn driver_reset_keeps_strategy_tag_in_sync_with_active_backend() {
    use super::strategy::StrategyTag;

    let cases = [
        (CompressionLevel::Level(1), StrategyTag::Fast),
        (CompressionLevel::Level(2), StrategyTag::Fast),
        (CompressionLevel::Level(3), StrategyTag::Dfast),
        (CompressionLevel::Level(4), StrategyTag::Dfast),
        (CompressionLevel::Level(5), StrategyTag::Greedy),
        (CompressionLevel::Level(7), StrategyTag::Lazy),
        (CompressionLevel::Level(12), StrategyTag::Lazy),
        (CompressionLevel::Level(13), StrategyTag::Btlazy2),
        (CompressionLevel::Level(14), StrategyTag::Btlazy2),
        (CompressionLevel::Level(15), StrategyTag::Btlazy2),
        (CompressionLevel::Level(16), StrategyTag::BtOpt),
        (CompressionLevel::Level(18), StrategyTag::BtUltra),
        (CompressionLevel::Level(22), StrategyTag::BtUltra2),
        (CompressionLevel::Fastest, StrategyTag::Fast),
        (CompressionLevel::Default, StrategyTag::Dfast),
        (CompressionLevel::Better, StrategyTag::Lazy),
        // `Best` sits on level 13 (the first dominant point of the deep band).
        (CompressionLevel::Best, StrategyTag::Btlazy2),
    ];

    for (level, expected) in cases {
        let mut driver = MatchGeneratorDriver::new(32, 2);
        driver.reset(level);
        assert_eq!(
            driver.strategy_tag, expected,
            "strategy_tag wrong for {level:?}"
        );

        let tag_backend = driver.strategy_tag.backend();
        let live_backend = driver.active_backend();
        assert_eq!(
            tag_backend, live_backend,
            "strategy_tag backend disagrees with active_backend for {level:?}"
        );
    }
}
8351
/// Levels 16 and 17 must resolve to the hash-chain backend tag and map to the
/// `BtOpt` strategy.
#[test]
fn level_16_17_map_to_btopt_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    let resolved = [
        resolve_level_params(CompressionLevel::Level(16), None),
        resolve_level_params(CompressionLevel::Level(17), None),
    ];
    for params in resolved {
        assert_eq!(params.backend(), BackendTag::HashChain);
    }
    for level in [16, 17] {
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtOpt);
    }
}
8362
/// Level 18 maps to `BtUltra` and level 19 to `BtUltra2`; both resolve to the
/// hash-chain backend tag.
///
/// This follows upstream zstd `clevels.h` for the srcSize > 256 KiB tier
/// (18 = `ZSTD_btultra`, 19 = `ZSTD_btultra2`). Level 19 used to be mapped to
/// plain btultra, which under-searched (searchLog 6 vs 7) and cost ~3.7%
/// ratio on the repo corpus.
#[test]
fn level_18_maps_to_btultra_level_19_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    let resolved = [
        resolve_level_params(CompressionLevel::Level(18), None),
        resolve_level_params(CompressionLevel::Level(19), None),
    ];
    for params in resolved {
        assert_eq!(params.backend(), BackendTag::HashChain);
    }

    let expected_tags = [(18, StrategyTag::BtUltra), (19, StrategyTag::BtUltra2)];
    for (level, tag) in expected_tags {
        assert_eq!(StrategyTag::for_level(level), tag);
    }
}
8377
/// Levels 20 through 22 must each resolve to the hash-chain backend tag and
/// map to the `BtUltra2` strategy.
#[test]
fn level_20_22_map_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    for level in 20u8..=22 {
        // Widen losslessly for the level enum; the strategy lookup takes the
        // narrow value directly.
        let resolved = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(resolved.backend(), BackendTag::HashChain);
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtUltra2);
    }
}
8387
/// Pins the level-22 parameters resolved without a source-size hint:
/// window log 27, hash log 25, chain log 27, search depth 512 and target
/// length 999.
#[test]
fn level22_uses_target_length_and_large_input_tables() {
    let resolved = resolve_level_params(CompressionLevel::Level(22), None);
    assert_eq!(resolved.window_log, 27);

    let chain_cfg = resolved.hc.unwrap();
    assert_eq!(chain_cfg.hash_log, 25);
    assert_eq!(chain_cfg.chain_log, 27);
    assert_eq!(chain_cfg.search_depth, 1 << 9);
    assert_eq!(chain_cfg.target_len, 999);
}
8398
/// Pins the resolved parameter tuple for each of levels 16 through 21 so the
/// `clevels.h` alignment cannot drift unnoticed.
///
/// Levels 16-20 mirror upstream `clevels.h` (srcSize > 256 KiB tier, with
/// search_depth = 1 << searchLog). Level 21 deliberately keeps a deeper
/// search_depth (512 vs upstream's 128): it beats C on ratio there, and the
/// deeper walk is an intentional ratio-positive divergence.
#[test]
fn bt_levels_16_to_21_pin_clevels_params() {
    // Columns: level, window_log, hash_log, chain_log, search_depth, target_len.
    let pinned: [(u8, u8, usize, usize, usize, usize); 6] = [
        (16, 22, 22, 22, 32, 48),
        (17, 23, 22, 23, 32, 64),
        (18, 23, 22, 23, 64, 64),
        (19, 23, 22, 24, 128, 256),
        (20, 25, 23, 25, 128, 256),
        (21, 26, 24, 24, 512, 256),
    ];

    for (level, window_log, hash_log, chain_log, search_depth, target_len) in pinned {
        let resolved = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(resolved.window_log, window_log, "level {level} window_log");

        let chain_cfg = resolved.hc.unwrap();
        assert_eq!(chain_cfg.hash_log, hash_log, "level {level} hash_log");
        assert_eq!(chain_cfg.chain_log, chain_log, "level {level} chain_log");
        assert_eq!(
            chain_cfg.search_depth, search_depth,
            "level {level} search_depth"
        );
        assert_eq!(chain_cfg.target_len, target_len, "level {level} target_len");
    }
}
8426
/// Pins the level-22 parameters selected for three source-size hints
/// (16 KiB, 128 KiB, 256 KiB). The target length is 999 in every tier.
#[test]
fn level22_source_size_hint_uses_btultra2_tiers() {
    // Columns: source size hint, window_log, hash_log, chain_log, search_depth.
    let tiers: [(u64, u8, usize, usize, usize); 3] = [
        (16 * 1024, 14, 15, 15, 1 << 10),
        (128 * 1024, 17, 17, 18, 1 << 11),
        (256 * 1024, 18, 19, 19, 1 << 13),
    ];

    for (source_size, window_log, hash_log, chain_log, search_depth) in tiers {
        let resolved = resolve_level_params(CompressionLevel::Level(22), Some(source_size));
        assert_eq!(resolved.window_log, window_log);

        let chain_cfg = resolved.hc.unwrap();
        assert_eq!(chain_cfg.hash_log, hash_log);
        assert_eq!(chain_cfg.chain_log, chain_log);
        assert_eq!(chain_cfg.search_depth, search_depth);
        assert_eq!(chain_cfg.target_len, 999);
    }
}
8453
/// A non-power-of-two hint of 15 027 bytes (<= 16 KB) selects the table[3]
/// btultra2 row, and the source-size clamp yields windowLog 14
/// (ceil log2 15027). The check is pure Rust against the constant tier-3
/// geometry — no FFI involved.
#[test]
fn level22_non_power_of_two_small_source_uses_tier3_params() {
    let hint = 15_027u64;
    let resolved = resolve_level_params(CompressionLevel::Level(22), Some(hint));
    let chain_cfg = resolved.hc.unwrap();

    assert_eq!(resolved.window_log, 14);
    assert_eq!(chain_cfg.chain_log, 15);
    assert_eq!(chain_cfg.hash_log, 15);
    assert_eq!(chain_cfg.search_depth, 1 << 10);
    assert_eq!(HC_OPT_MIN_MATCH_LEN, 3);
    assert_eq!(chain_cfg.target_len, 999);
}
8470
/// The table's `hash3_log` is 14 after configuring with the 16K level-22
/// config at window log 14, and equals `HC3_HASH_LOG` after reconfiguring
/// with the full level-22 config at window log 27.
#[test]
fn level22_small_source_uses_window_bounded_hash3_log() {
    use super::strategy::StrategyTag;

    let mut matcher = HcMatchGenerator::new(1 << 14);

    matcher.configure(BTULTRA2_HC_CONFIG_L22_16K, StrategyTag::BtUltra2, 14);
    assert_eq!(matcher.table.hash3_log, 14);

    matcher.configure(BTULTRA2_HC_CONFIG_L22, StrategyTag::BtUltra2, 27);
    assert_eq!(matcher.table.hash3_log, HC3_HASH_LOG);
}
8488
/// After matching a first 32 KiB block under the btultra2 configuration, the
/// optimal-parse state must carry non-zero literal-length and offset-code
/// sums.
#[test]
fn btultra2_seed_pass_initializes_opt_state() {
    use super::strategy::StrategyTag;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Byte values cycle through 0..=250, i.e. `i % 251` for each index.
    let payload: Vec<u8> = (0u8..251).cycle().take(32 * 1024).collect();
    matcher.table.add_data(payload, |_| {});
    matcher.start_matching(|_| {});

    let seeded = &matcher.backend.bt_mut().opt_state;
    assert!(
        seeded.lit_length_sum > 0,
        "btultra2 first block should seed non-zero sequence statistics"
    );
    assert!(
        seeded.off_code_sum > 0,
        "btultra2 first block should seed offset-code statistics"
    );
}
8509
/// The btultra2 cost profile must have `favor_small_offsets` off and
/// `accurate` on, matching upstream zstd `opt2` pricing.
///
/// Before Phase 3 this test built the profile twice (`pass2=false` and
/// `pass2=true`) because `for_mode` told them apart. With
/// `const_for_strategy::<BtUltra2>()` only one profile exists, so a single
/// binding is enough to capture the invariant.
#[test]
fn btultra2_profile_disables_small_offset_handicap() {
    use super::strategy::BtUltra2;

    let opt2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let handicap_enabled = opt2.favor_small_offsets;
    assert!(
        !handicap_enabled,
        "btultra2 should match upstream zstd opt2 offset pricing"
    );
    assert!(
        opt2.accurate,
        "btultra2 should use upstream zstd opt2 accurate pricing"
    );
}
8527
/// The `BtUltra` cost profile must carry a chain-depth budget of 64.
#[test]
fn btultra_profile_keeps_search_depth_budget() {
    use super::strategy::BtUltra;

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra>();
    assert_eq!(
        profile.max_chain_depth, 64,
        "btultra chain-depth budget must match clevels.h level 18 searchLog 6 (1 << 6 = 64)"
    );
}
8536
/// The `BtOpt` cost profile must carry a chain-depth budget of 32.
#[test]
fn btopt_profile_keeps_search_depth_budget() {
    use super::strategy::BtOpt;

    let profile = HcOptimalCostProfile::const_for_strategy::<BtOpt>();
    assert_eq!(
        profile.max_chain_depth, 32,
        "btopt should not cap chain depth below upstream zstd btopt search budget"
    );
}
8545
/// With `target_len` forced to 13 on a btultra2-configured matcher, the
/// sufficient match length for the `BtUltra2` profile must be 13.
#[test]
fn sufficient_match_len_is_clamped_by_target_len() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);
    matcher.hc.target_len = 13;

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient = matcher.hc.sufficient_match_len_for_pass(profile);
    assert_eq!(sufficient, 13);
}
8558
/// With `target_len` set to 57, the `BtOpt`, `BtUltra` and `BtUltra2`
/// profiles must each report 57 as the sufficient match length.
#[test]
fn opt_modes_use_target_len_as_sufficient_len() {
    use super::strategy::{BtOpt, BtUltra, BtUltra2};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = 57;

    let opt_profiles = [
        HcOptimalCostProfile::const_for_strategy::<BtOpt>(),
        HcOptimalCostProfile::const_for_strategy::<BtUltra>(),
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    ];
    for opt_profile in opt_profiles {
        let sufficient = matcher.hc.sufficient_match_len_for_pass(opt_profile);
        assert_eq!(sufficient, 57);
    }
}
8573
/// An oversized `target_len` (`usize::MAX / 2`) must yield a sufficient match
/// length of `HC_OPT_NUM - 1`.
#[test]
fn sufficient_match_len_is_capped_by_opt_num() {
    use super::strategy::BtUltra2;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = usize::MAX / 2;

    let opt2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient = matcher.hc.sufficient_match_len_for_pass(opt2);
    assert_eq!(sufficient, HC_OPT_NUM - 1);
}
8581
/// Seeding the matcher with a Huffman table plus the default LL/ML/OF tables,
/// then rescaling, must leave frequencies that differ from the fallback
/// bootstrap values.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_entropy_seed_initializes_opt_state_from_tables() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(
        b"aaabbbbccccddddeeeeefffffgggg",
    );
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    matcher.backend.bt_mut().opt_state.rescale_freqs(
        b"abcd",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );

    // Fallback literal-length frequencies: 4, 2, then 1 for every other code.
    let mut base_ll_freqs = [1u32; HC_MAX_LL + 1];
    base_ll_freqs[0] = 4;
    base_ll_freqs[1] = 2;

    let seeded = &matcher.backend.bt_mut().opt_state;
    assert_ne!(
        seeded.lit_length_freq, base_ll_freqs,
        "dictionary entropy should override fallback LL bootstrap frequencies"
    );
    let ml_is_non_uniform = seeded.match_length_freq.iter().any(|&freq| freq != 1);
    assert!(
        ml_is_non_uniform,
        "dictionary entropy should seed non-uniform ML frequencies"
    );
    assert_ne!(
        seeded.off_code_freq[0], 6,
        "dictionary entropy should override fallback OF bootstrap frequencies"
    );
}
8630
/// Seeding only the default LL/ML/OF tables (no Huffman table), then
/// rescaling, must still leave frequencies that differ from the fallback
/// bootstrap values.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_fse_seed_applies_without_huffman_seed() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(None, Some(&*ll_table), Some(&*ml_table), Some(&*of_table));
    matcher.backend.bt_mut().opt_state.rescale_freqs(
        b"abcd",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );

    // Fallback literal-length frequencies: 4, 2, then 1 for every other code.
    let mut base_ll_freqs = [1u32; HC_MAX_LL + 1];
    base_ll_freqs[0] = 4;
    base_ll_freqs[1] = 2;

    let seeded = &matcher.backend.bt_mut().opt_state;
    assert_ne!(
        seeded.lit_length_freq, base_ll_freqs,
        "FSE seed should still override LL bootstrap frequencies without huffman seed"
    );
    let ml_is_non_uniform = seeded.match_length_freq.iter().any(|&freq| freq != 1);
    assert!(
        ml_is_non_uniform,
        "FSE seed should still seed non-uniform ML frequencies"
    );
    assert_ne!(
        seeded.off_code_freq[0], 6,
        "FSE seed should still override OF bootstrap frequencies without huffman seed"
    );
}
8674
/// After seeding the default LL/ML/OF tables and rescaling on a 3-byte
/// source, the price type must still be `Dynamic`.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_seed_overrides_predef_price_mode_on_tiny_input() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(None, Some(&*ll_table), Some(&*ml_table), Some(&*of_table));
    matcher.backend.bt_mut().opt_state.rescale_freqs(
        b"abc",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );

    let stays_dynamic = matches!(
        matcher.backend.bt_mut().opt_state.price_type,
        HcOptPriceType::Dynamic
    );
    assert!(
        stays_dynamic,
        "dictionary-seeded first block should stay in dynamic mode even for tiny src"
    );
}
8701
/// A literal length of `HC_BLOCKSIZE_MAX` must be priced exactly one bit
/// (`HC_BITCOST_MULTIPLIER`) above `HC_BLOCKSIZE_MAX - 1`, in both predefined
/// and dynamic price modes.
#[test]
fn lit_length_price_blocksize_max_costs_one_extra_bit() {
    let profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::BtUltra2>();
    let just_below_max = HC_BLOCKSIZE_MAX.saturating_sub(1);

    // Predefined mode: a fresh state with only the price type switched.
    let mut predefined = HcOptState::new();
    predefined.price_type = HcOptPriceType::Predefined;
    let predef_at_max = profile.lit_length_price(&predefined, HC_BLOCKSIZE_MAX);
    let predef_below = profile.lit_length_price(&predefined, just_below_max);
    assert_eq!(
        predef_at_max,
        predef_below + HC_BITCOST_MULTIPLIER,
        "predefined litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
    );

    // Dynamic mode: every frequency table is filled with ones and each sum is
    // set to its table's size before base prices are computed.
    let mut dynamic = HcOptState::new();
    dynamic.price_type = HcOptPriceType::Dynamic;
    dynamic.lit_length_freq.fill(1);
    dynamic.match_length_freq.fill(1);
    dynamic.off_code_freq.fill(1);
    dynamic.lit_freq.fill(1);
    dynamic.lit_length_sum = (HC_MAX_LL + 1) as u32;
    dynamic.match_length_sum = (HC_MAX_ML + 1) as u32;
    dynamic.off_code_sum = (HC_MAX_OFF + 1) as u32;
    dynamic.lit_sum = (HC_MAX_LIT + 1) as u32;
    dynamic.set_base_prices(true);
    let dyn_at_max = profile.lit_length_price(&dynamic, HC_BLOCKSIZE_MAX);
    let dyn_below = profile.lit_length_price(&dynamic, just_below_max);
    assert_eq!(
        dyn_at_max,
        dyn_below + HC_BITCOST_MULTIPLIER,
        "dynamic litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
    );
}
8736
/// Once the default LL/ML/OF tables have been seeded, the btultra2 seed pass
/// must be declined even for a block just above `HC_PREDEF_THRESHOLD`.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn btultra2_seed_pass_disabled_when_dictionary_entropy_seed_present() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(None, Some(&*ll_table), Some(&*ml_table), Some(&*of_table));

    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !runs_seed_pass,
        "dictionary-seeded first block should skip btultra2 warmup pass"
    );
}
8755
/// With `history_abs_start` moved to 17 and a 16-byte test chunk pushed, the
/// btultra2 seed pass must be declined.
#[test]
fn btultra2_seed_pass_disabled_when_prefix_history_exists() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    matcher.table.history_abs_start = 17;
    matcher.table.push_test_chunk(b"abcdefghijklmnop".to_vec());

    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 9);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must be first-block only (no prefix history)"
    );
}
8771
/// A block of exactly `HC_PREDEF_THRESHOLD` bytes must not trigger the
/// btultra2 seed pass.
#[test]
fn btultra2_seed_pass_disabled_for_tiny_block() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup should not run at or below predefined threshold"
    );
}
8785
/// A non-zero `lit_length_sum` in the optimal-parse state must make the
/// btultra2 seed pass decline.
#[test]
fn btultra2_seed_pass_disabled_after_stats_initialized() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Pretend statistics were already collected.
    matcher.backend.bt_mut().opt_state.lit_length_sum = 1;

    let runs_seed_pass =
        matcher.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 32);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup should run only for first block before stats are initialized"
    );
}
8800
/// When the window size already exceeds the current block length, the
/// btultra2 seed pass must be declined.
#[test]
fn btultra2_seed_pass_disabled_when_not_at_frame_start() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Simulated non-first-block state: the deque holds no prefix for the
    // current block, yet the total produced window already counts earlier
    // output (window_size is poked by hand to fake that prior output).
    matcher.table.window_size = HC_PREDEF_THRESHOLD + 64;
    // Register the current block as a single live chunk; the seed-pass check
    // reads lengths, not bytes.
    matcher.table.chunk_lens.push_back(HC_PREDEF_THRESHOLD + 32);

    let runs_seed_pass =
        matcher.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 32);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must not run after frame start"
    );
}
8820
/// A pending LDM raw sequence in the backend must make the btultra2 seed pass
/// decline.
#[test]
fn btultra2_seed_pass_disabled_when_ldm_sequences_exist() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    matcher.table.window_size = HC_PREDEF_THRESHOLD + 64;
    matcher.table.chunk_lens.push_back(HC_PREDEF_THRESHOLD + 64);

    let pending_ldm = HcRawSeq {
        offset: 16,
        lit_length: 8,
        match_length: 32,
    };
    matcher.backend.bt_mut().ldm_sequences.push(pending_ldm);

    let runs_seed_pass =
        matcher.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 32);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must not run when LDM already produced sequences"
    );
}
8841
/// With literal compression switched off, pricing the literal `b'a'` in
/// predefined mode must return exactly `8 * HC_BITCOST_MULTIPLIER`.
#[test]
fn literal_price_uses_eight_bits_when_literals_uncompressed() {
    use super::strategy::BtUltra2;

    let mut raw_literal_state = HcOptState::new();
    raw_literal_state.set_literals_compressed_for_tests(false);
    raw_literal_state.price_type = HcOptPriceType::Predefined;

    let opt2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let price = opt2.literal_price(&raw_literal_state, b'a');
    assert_eq!(
        price,
        8 * HC_BITCOST_MULTIPLIER,
        "uncompressed literals should cost 8 bits regardless of price mode"
    );
}
8854
/// With literal compression switched off, one `update_stats` call must leave
/// the literal sum and literal frequencies at zero while bumping the
/// literal-length, match-length and offset-code sums to 1.
#[test]
fn update_stats_skips_literal_frequencies_when_uncompressed() {
    let mut opt = HcOptState::new();
    opt.set_literals_compressed_for_tests(false);
    opt.update_stats(3, b"abc", 4, 8);

    // Literal statistics stay untouched.
    assert_eq!(
        opt.lit_sum, 0,
        "literal sum must remain unchanged when literal compression is disabled"
    );
    let total_literal_freq = opt.lit_freq.iter().copied().sum::<u32>();
    assert_eq!(
        total_literal_freq, 0,
        "literal frequencies must not be updated when literal compression is disabled"
    );

    // Sequence statistics are still recorded.
    assert_eq!(
        opt.lit_length_sum, 1,
        "literal-length stats still update for sequence modeling"
    );
    assert_eq!(
        opt.match_length_sum, 1,
        "match-length stats still update for sequence modeling"
    );
    assert_eq!(
        opt.off_code_sum, 1,
        "offset-code stats still update for sequence modeling"
    );
}
8882
/// With literal compression switched off, a dictionary seed that includes a
/// Huffman table must leave the literal sum and literal frequencies at zero
/// after rescaling.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_huffman_seed_ignored_when_literals_uncompressed() {
    use super::strategy::BtUltra2;

    let mut opt = HcOptState::new();
    opt.set_literals_compressed_for_tests(false);

    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(
        b"aaaaabbbbcccddeeff00112233445566778899",
    );
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    opt.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );
    opt.rescale_freqs(
        b"abcd",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );

    assert_eq!(
        opt.lit_sum, 0,
        "literal sum must stay zero when literals are uncompressed"
    );
    let total_literal_freq = opt.lit_freq.iter().copied().sum::<u32>();
    assert_eq!(
        total_literal_freq, 0,
        "literal frequencies must ignore dictionary huffman seed when uncompressed"
    );
}
8909
/// Repcode candidate enumeration must depend on the literal length.
///
/// The history is six filler bytes followed by "ABCDEF" twice, and the probe
/// sits at the start of the second copy. With rep history `[6, 3, 9]`,
/// offset 6 must be reported when `lit_len` is 1 and must not be reported
/// when `lit_len` is 0.
#[test]
fn hc_repcode_candidates_respect_litlen_dependent_rep_order() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"xxxxxxABCDEFABCDEF".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    // Index 12 is where the second "ABCDEF" begins.
    let probe_pos = 12usize;
    let history_end = matcher.table.history.len();
    let rep_history = [6u32, 3u32, 9u32];

    let mut offsets_after_literal = Vec::new();
    matcher.hc.for_each_repcode_candidate_with_reps(
        &matcher.table,
        probe_pos,
        1,
        rep_history,
        history_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_after_literal.push(candidate.offset),
    );
    assert!(
        offsets_after_literal.contains(&6),
        "when lit_len>0, rep0 should be considered and match"
    );

    let mut offsets_without_literal = Vec::new();
    matcher.hc.for_each_repcode_candidate_with_reps(
        &matcher.table,
        probe_pos,
        0,
        rep_history,
        history_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_without_literal.push(candidate.offset),
    );
    assert!(
        !offsets_without_literal.contains(&6),
        "when lit_len==0, rep0 is not directly eligible (ll0 semantics)"
    );
}
8955
/// With both the matcher's search depth and the profile's chain depth at
/// zero, candidate collection on a period-3 history must still return the
/// rep0 candidate (offset 3).
#[test]
fn hc_collect_optimal_candidates_keeps_reps_when_chain_depth_zero() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.hc.search_depth = 0;
    matcher.table.history = b"xyzxyzxyzxyz".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    let probe_pos = 6usize;
    let history_end = matcher.table.history.len();
    let no_chain_profile = HcOptimalCostProfile {
        accurate: false,
        favor_small_offsets: false,
        max_chain_depth: 0,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [3, 6, 9],
        lit_len: 1,
        ldm_candidate: None,
    };

    let mut found = Vec::new();
    matcher.collect_optimal_candidates(probe_pos, history_end, no_chain_profile, query, &mut found);

    assert!(
        !found.is_empty(),
        "rep candidates should remain available even when chain depth is zero"
    );
    let has_rep0 = found.iter().any(|candidate| candidate.offset == 3);
    assert!(has_rep0, "rep0 candidate should be retained");
}
8993
/// On a ten-byte run of `a` with positions before the probe inserted,
/// candidate collection with reps `[1, 4, 8]` must only report offsets 1
/// or 4.
#[test]
fn hc_collect_optimal_candidates_rep_tail_match_skips_chain_probe() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"aaaaaaaaaa".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    let probe_pos = 6usize;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);

    let deep_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut found = Vec::new();
    matcher.collect_optimal_candidates(probe_pos, history_end, deep_profile, query, &mut found);

    let only_rep_offsets = found
        .iter()
        .map(|candidate| candidate.offset)
        .all(|offset| offset == 1 || offset == 4);
    assert!(
        only_rep_offsets,
        "terminal rep match should return before chain probing adds non-rep offsets"
    );
}
9031
/// On a period-3 history with positions before the probe inserted and the
/// skip marker reset to zero, candidate collection must leave
/// `skip_insert_until_abs` beyond the probe position.
#[test]
fn hc_collect_optimal_candidates_long_chain_match_advances_skip_window() {
    let mut matcher = HcMatchGenerator::new(128);
    matcher.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    let probe_pos = 9usize;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);
    // Reset the marker so any advance seen below comes from the collection.
    matcher.table.skip_insert_until_abs = 0;

    let deep_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut found = Vec::new();
    matcher.collect_optimal_candidates(probe_pos, history_end, deep_profile, query, &mut found);

    let skip_until = matcher.table.skip_insert_until_abs;
    assert!(
        skip_until > probe_pos,
        "long chain match should advance skip window to avoid redundant immediate insertions"
    );
}
9069
/// With `sufficient_match_len` lowered to 10 on a period-3 history, the skip
/// marker after candidate collection must lie beyond the probe position and
/// at most eight bytes short of the furthest candidate match end.
#[test]
fn hc_collect_optimal_candidates_chain_fast_skip_uses_match_end_minus_8() {
    let mut matcher = HcMatchGenerator::new(128);
    matcher.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    let probe_pos = 9usize;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);
    // Reset the marker so any advance seen below comes from the collection.
    matcher.table.skip_insert_until_abs = 0;

    let fast_skip_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 32,
        sufficient_match_len: 10,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut found = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        fast_skip_profile,
        query,
        &mut found,
    );

    // Furthest end position reached by any reported candidate.
    let furthest_match_end = found
        .iter()
        .map(|candidate| candidate.start.saturating_add(candidate.match_len))
        .max()
        .expect("expected at least one candidate");
    let skip_until = matcher.table.skip_insert_until_abs;

    assert!(
        skip_until > probe_pos,
        "chain fast-skip must advance past current position"
    );
    assert!(
        skip_until <= furthest_match_end.saturating_sub(8),
        "chain fast-skip must not exceed upstream zstd-style matchEndIdx - 8 bound"
    );
}
9116
/// With zero search depth and no positions inserted, candidate collection must
/// still advance the skip window to exactly one past the probed position.
#[test]
fn hc_collect_optimal_candidates_advances_skip_window_on_plain_bt_path() {
    let probe_pos = 8usize;

    // Non-repeating history; no positions are inserted before the probe.
    let mut matcher = HcMatchGenerator::new(256);
    matcher.table.history = b"abcdefghijklmnop".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 0;
    matcher.table.ensure_tables();
    matcher.table.skip_insert_until_abs = 0;

    let cost_profile = HcOptimalCostProfile {
        max_chain_depth: 0,
        sufficient_match_len: usize::MAX / 2,
        accurate: true,
        favor_small_offsets: false,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };
    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        cost_profile,
        query,
        &mut candidates,
    );

    assert_eq!(
        matcher.table.skip_insert_until_abs,
        probe_pos.saturating_add(1),
        "plain BT path should advance skip window by 1 via upstream zstd matchEndIdx baseline"
    );
}
9155
9156// Removed: the three `hc_collect_optimal_candidates_*_hash3_*` /
9157// `hc_hash3_tail_match_*` tests forced `search_depth = 0` together
9158// with `hash3_log != 0`, an HC-chain-walker-only fixture state that
9159// production never reaches (hash3 is BtUltra2-only and BtUltra2 always
9160// runs `search_depth = 512`). They depended on the `has_hash3 =>
9161// BtUltra2` escape hatch in the test dispatcher; with that hatch gone
9162// (CR review on PR #123) and the dispatcher routing purely from
9163// `self.strategy_tag`, there is no production-shaped configuration
9164// that reproduces what those tests asserted. The corresponding hash3
9165// invariants are exercised end-to-end by the existing level22 roundtrip
9166// + upstream zstd-parity ratio gate.
9167
/// An LDM candidate supplied through the query must show up, with its offset
/// and length intact, in the collected candidate set.
#[test]
fn hc_ldm_candidates_are_merged_into_optimal_candidates() {
    let probe_pos = 128usize;
    let block_end = 256usize;

    let mut matcher = HcMatchGenerator::new(512);
    matcher.table.history = (0..256usize).map(|idx| (idx % 251) as u8).collect();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    // Externally provided long-distance match anchored at the probe position.
    let ldm = MatchCandidate {
        start: probe_pos,
        offset: 96,
        match_len: 40,
    };

    let cost_profile = HcOptimalCostProfile {
        max_chain_depth: 0,
        sufficient_match_len: usize::MAX / 2,
        accurate: true,
        favor_small_offsets: false,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: Some(ldm),
    };
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        block_end,
        cost_profile,
        query,
        &mut candidates,
    );

    let ldm_present = candidates
        .iter()
        .any(|found| found.offset == ldm.offset && found.match_len == ldm.match_len);
    assert!(
        ldm_present,
        "LDM candidate should be present in optimal candidate set"
    );
}
9208
/// Both `BtUltra2` and `BtUltra` must keep candidates whose offset reaches
/// back into the dictionary region (offset >= 32 from position 96 with a
/// dictionary limit of 64).
///
/// Each strategy gets its own freshly configured matcher, and the candidate
/// buffer is emptied between the two runs so the second assertion can only be
/// satisfied by candidates produced by the second strategy.
#[test]
fn btultra_and_btultra2_both_keep_dictionary_candidates() {
    // Routes the BtUltra2 / BtUltra fixture through the production
    // `configure()` path so derived state (`hash3_log`, `is_btultra2`,
    // `uses_bt`, `backend`) stays consistent — manually flipping the
    // strategy flags here used to leave `hash3_log` / `hash3_table` in
    // the previous mode's shape and trip the
    // `Strategy::USE_HASH3 ⇒ hash3_log != 0` debug invariant inside
    // `collect_optimal_candidates_initialized_body`.
    use super::strategy::StrategyTag;

    let test_config = HcConfig {
        hash_log: 23,
        chain_log: 22,
        search_depth: 32,
        target_len: 256,
        search_mls: 4,
    };
    let window_log = 20u8;

    // Builds a 160-byte history: bytes 0..64 form the "dictionary" region,
    // bytes 64..160 the current data, with 24 bytes at `abs_pos` copied from
    // offset 16 so a match into the dictionary region exists.
    let prepare_history = |hc: &mut HcMatchGenerator, abs_pos: usize| {
        hc.table.history = alloc::vec![0u8; 160];
        for i in 0..64 {
            hc.table.history[i] = b'a' + (i % 7) as u8;
        }
        for i in 64..160 {
            hc.table.history[i] = b'k' + (i % 5) as u8;
        }
        for i in 0..24 {
            hc.table.history[abs_pos + i] = hc.table.history[16 + i];
        }
        hc.table.history_start = 0;
        hc.table.history_abs_start = 0;
        hc.table.position_base = 0;
        hc.table.ensure_tables();
        hc.table.insert_positions(0, abs_pos);
        hc.table.dictionary_limit_abs = Some(64);
        hc.table.skip_insert_until_abs = 0;
    };

    let profile = HcOptimalCostProfile {
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
        accurate: true,
        favor_small_offsets: false,
    };
    let abs_pos = 96usize;
    let mut out = Vec::new();

    // --- BtUltra2 ---
    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra2, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra2 should retain dictionary candidates on upstream zstd-parity path"
    );

    // Drop the BtUltra2 results before reusing the buffer: without this, any
    // leftover BtUltra2 candidate could make the BtUltra assertion pass even
    // if BtUltra itself produced nothing. Harmless if the collector already
    // resets the buffer on entry.
    out.clear();

    // --- BtUltra ---
    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra should retain dictionary candidates"
    );
}
9296
/// A 1 KiB source-size hint must shrink both dfast hash tables relative to the
/// unhinted `Level(3)` configuration while the window stays at the hinted floor.
#[test]
fn driver_small_source_hint_shrinks_dfast_hash_tables() {
    /// Commits `payload` as one block and skips matching over it.
    fn feed_block(driver: &mut MatchGeneratorDriver, payload: &[u8]) {
        let mut space = driver.get_next_space();
        space[..payload.len()].copy_from_slice(payload);
        space.truncate(payload.len());
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Baseline: no hint.
    driver.reset(CompressionLevel::Level(3));
    feed_block(&mut driver, b"abcabcabcabc");
    // Upstream zstd-parity split sizes: long-hash = DFAST_HASH_BITS,
    // short-hash = DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA.
    let full_long = driver.dfast_matcher().long_hash.len();
    let full_short = driver.dfast_matcher().short_hash.len();
    assert_eq!(full_long, 1 << DFAST_HASH_BITS);
    assert_eq!(
        full_short,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA)
    );

    // Hinted: 1 KiB source.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(3));
    feed_block(&mut driver, b"xyzxyzxyzxyz");
    let hinted_long = driver.dfast_matcher().long_hash.len();
    let hinted_short = driver.dfast_matcher().short_hash.len();

    // The wire `window_log` keeps its floor (decoder interop), whereas the
    // internal dfast tables are sized from the RAW 1 KiB source:
    // `table_window = 1 << ceil_log2(1024) = 1 << 10`. Both tables therefore
    // end up at the `MIN_WINDOW_LOG` floor — the long table via
    // `dfast_hash_bits_for_window(1 << 10) = 10`, the short table one
    // `DFAST_SHORT_HASH_BITS_DELTA` step lower but clamped back up to
    // `MIN_WINDOW_LOG`.
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(hinted_long, 1 << MIN_WINDOW_LOG);
    assert_eq!(hinted_short, 1 << MIN_WINDOW_LOG);
    let both_shrunk = hinted_long < full_long && hinted_short < full_short;
    assert!(
        both_shrunk,
        "tiny source hint should reduce both dfast tables"
    );
}
9342
/// Regression: a `u64::MAX` source-size hint must not overflow the shift used
/// to size the Dfast / Row table window in `reset`.
///
/// That shift is derived from `ceil_log2(hint)`; for a hint >= 2^63 + 1 it is
/// 64, and `1usize << 64` panics in debug / wraps to 0 in release before the
/// `.min(max_window_size)` cap applies. The table must be sized from the real
/// window instead.
#[test]
fn driver_huge_source_hint_does_not_overflow_table_window_shift() {
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.reset(CompressionLevel::Level(3));

    // Commit a small block so the dfast tables are actually built.
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    let long_table_len = driver.dfast_matcher().long_hash.len();
    let min_table_len: usize = 1 << MIN_WINDOW_LOG;
    assert!(
        long_table_len >= min_table_len,
        "huge hint must size the dfast table from the real window, not wrap to zero"
    );
}
9365
/// Regression: `u64::MAX` source hint plus a dictionary hint must not overflow
/// the HC/BT history-mirror pre-size.
///
/// The pre-size adds the dictionary hint to the source-size hint before
/// `reserve_history` clamps to the window ceiling. With the "unknown size"
/// sentinel (`u64::MAX`) and any positive dictionary hint,
/// `(src as usize) + dict_hint` overflows `usize` (debug panic / release wrap
/// on 64-bit, `src as usize` truncation on 32-bit). Level 16 (BtOpt) goes
/// through the HashChain/BT storage arm that owns this reserve.
#[test]
fn driver_huge_source_hint_with_dict_does_not_overflow_hc_reserve() {
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.set_dictionary_size_hint(64 * 1024);
    driver.reset(CompressionLevel::Level(16));

    // The saturated `usize::MAX` target has to be clamped to the HC history
    // ceiling rather than reserved literally (that would OOM/panic). Level 16
    // uses window_log 22, giving a ceiling of `window + window/4 + one block`
    // per the `reserve_history` formula. Checking the reached capacity (not
    // just absence of a panic) also catches an under-reserved mirror.
    let window = 1usize << 22;
    let expected_history_ceiling = window + (window >> 2) + crate::common::MAX_BLOCK_SIZE as usize;
    let reserved = driver.hc_matcher().table.history.capacity();
    assert!(
        reserved >= expected_history_ceiling,
        "huge source + dict hint must reserve the clamped HC history ceiling, got {}",
        reserved
    );

    // Finally push one small block through to prove the driver stays usable.
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);
}
9401
/// Regression: an explicit `chain_log` override must be honoured when a
/// RowHash level is forced onto the HashChain backend.
///
/// The fallback happens for a resolved window <= 14 (upstream
/// `ZSTD_resolveRowMatchFinderMode`). The RowHash override arm drops
/// `chain_log` (Row has no chain table), so the synthesised HC config used to
/// replace the caller's value with the upstream zstd `hashLog - 1`, silently
/// ignoring the override on small-window frames.
#[test]
fn driver_chain_log_override_survives_row_to_hc_fallback() {
    let chain_log_override = 10u32;
    let overrides = super::parameters::ParamOverrides {
        chain_log: Some(chain_log_override),
        ..Default::default()
    };

    let mut driver = MatchGeneratorDriver::new(32, 2);
    // A small source hint pins the window to the hinted floor (16 KiB =
    // windowLog 14), which makes the Level 6 Row finder fall back to HashChain.
    driver.set_source_size_hint(1 << 12);
    driver.set_param_overrides(Some(overrides));
    driver.reset(CompressionLevel::Level(6));

    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // The override (10) sits below the window cap (14), so the resolved HC
    // chain table has to reflect it — NOT the upstream zstd `hashLog - 1` (18,
    // clamped to the window 14). Before the fix this resolved to 14.
    let resolved_chain_log = driver.hc_matcher().table.chain_log;
    assert_eq!(
        resolved_chain_log,
        chain_log_override as usize,
        "explicit chain_log override must survive the Row->HC fallback, got {}",
        resolved_chain_log
    );
}
9436
/// Source-size hints on a Row level: a hint keeping the window above 14
/// shrinks the row table, and a tiny hint routes to the HashChain backend.
#[test]
fn driver_small_source_hint_shrinks_row_hash_tables() {
    /// Commits `payload` as one block and skips matching over it.
    fn feed_block(driver: &mut MatchGeneratorDriver, payload: &[u8]) {
        let mut space = driver.get_next_space();
        space[..payload.len()].copy_from_slice(payload);
        space.truncate(payload.len());
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Phase 1: unhinted Level 5.
    driver.reset(CompressionLevel::Level(5));
    feed_block(&mut driver, b"abcabcabcabc");
    let full_rows = driver.row_matcher().row_heads.len();
    // Level 5 takes the upstream row_log (clamp(searchLog=3, 4, 6) = 4) and the
    // upstream L5 hashLog (`ZSTD_getCParams(5,..).hashLog` = 19), hence the row
    // count of 1 << (ROW_L5.hash_bits - ROW_L5.row_log).
    assert_eq!(full_rows, 1 << (ROW_L5.hash_bits - ROW_L5.row_log));

    // Phase 2: a hint that leaves the resolved window > 14 STILL selects the
    // Row finder (upstream `ZSTD_resolveRowMatchFinderMode`: row mode on for
    // windowLog > 14) and narrows the row hash table to the source-derived
    // width. 64 KiB → raw source log 16, so `row_hash_bits_for_window(1 << 16)`
    // is below the level's full hash_bits (19) and the row count drops.
    driver.set_source_size_hint(1 << 16);
    driver.reset(CompressionLevel::Level(5));
    feed_block(&mut driver, b"xyzxyzxyzxyz");
    assert_eq!(
        driver.active_backend(),
        super::strategy::BackendTag::Row,
        "windowLog > 14 keeps the upstream row matchfinder"
    );
    let hinted_rows = driver.row_matcher().row_heads.len();
    assert!(
        hinted_rows < full_rows,
        "a window>14 source hint should reduce the row hash table footprint"
    );

    // Phase 3: a tiny hint floors the resolved window at
    // MIN_HINTED_WINDOW_LOG = 14. Upstream uses the HASH-CHAIN matcher (not
    // Row) at windowLog <= 14, so greedy/lazy/lazy2 must be routed to the
    // HashChain backend there.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(
        driver.active_backend(),
        super::strategy::BackendTag::HashChain,
        "windowLog <= 14 must fall back to the upstream zstd hash-chain matchfinder",
    );
}
9488
/// Replaying the sequences emitted by the row matcher over three consecutive
/// blocks must reproduce each block byte-for-byte.
#[test]
fn row_matches_roundtrip_multi_block_pattern() {
    /// Applies one emitted sequence to `decoded`, decoder-style.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                // Byte-at-a-time copy: the source range may overlap the bytes
                // being appended when `offset < match_len`.
                let copy_from = decoded.len() - offset;
                for step in 0..match_len {
                    let byte = decoded[copy_from + step];
                    decoded.push(byte);
                }
            }
        }
    }

    let pattern = [7, 13, 44, 184, 19, 96, 171, 109, 141, 251];
    let first_block: Vec<u8> = pattern.iter().copied().cycle().take(128 * 1024).collect();
    // Same construction as the first block, so a plain copy is identical.
    let second_block: Vec<u8> = first_block.clone();

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.ensure_tables();

    let mut history = Vec::new();

    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(history, first_block);

    matcher.add_data(second_block.clone(), |_| {});
    let second_start = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(&history[second_start..], second_block.as_slice());

    // Force a literals-only pass so the Sequence::Literals arm is exercised.
    let third_block: Vec<u8> = (0u8..=255).collect();
    matcher.add_data(third_block.clone(), |_| {});
    let third_start = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(&history[third_start..], third_block.as_slice());
}
9532
/// A 5-byte block must come back as literals only; a following block with
/// repeated data must produce at least one `Sequence::Triple`.
#[test]
fn row_short_block_emits_literals_only() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // Short block: collect literals and record whether any triple appears.
    matcher.add_data(b"abcde".to_vec(), |_| {});
    let mut short_block_triple = false;
    let mut replayed = Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            replayed.extend_from_slice(literals);
        } else {
            short_block_triple = true;
        }
    });
    assert!(
        !short_block_triple,
        "row backend must not emit triples for short blocks"
    );
    assert_eq!(replayed, b"abcde");

    // Then feed a clearly matchable block and ensure the Triple arm is reachable.
    matcher.add_data(b"abcdeabcde".to_vec(), |_| {});
    let mut repeat_block_triple = false;
    matcher.start_matching(|seq| {
        if matches!(seq, Sequence::Triple { .. }) {
            repeat_block_triple = true;
        }
    });
    assert!(
        repeat_block_triple,
        "row backend should emit triples on repeated data"
    );
}
9566
/// `pick_lazy_match` on a 6-byte input must hand back the supplied best
/// candidate unchanged.
#[test]
fn row_pick_lazy_returns_best_when_lookahead_is_out_of_bounds() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.add_data(b"abcabc".to_vec(), |_| {});
    // Allocate the row tables up front: the lookahead path can reach
    // `row_candidate` -> `row_heads[..]` once the accept floor is small enough
    // to pass the length gate (production always does this before probing).
    matcher.ensure_tables();

    let incumbent = MatchCandidate {
        start: 0,
        offset: 1,
        match_len: ROW_MIN_MATCH_LEN,
    };
    let chosen = matcher
        .pick_lazy_match(0, 0, Some(incumbent))
        .expect("best candidate must survive");

    assert_eq!(chosen.start, incumbent.start);
    assert_eq!(chosen.offset, incumbent.offset);
    assert_eq!(chosen.match_len, incumbent.match_len);
}
9591
/// The second block starts with the 3-byte tail of the first block, so the
/// row matcher must emit a literal-free triple at offset 3 while still
/// round-tripping both blocks.
#[test]
fn row_backfills_previous_block_tail_for_cross_boundary_match() {
    /// Applies one emitted sequence to `decoded`, decoder-style.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                // Byte-at-a-time copy: source and destination may overlap.
                let copy_from = decoded.len() - offset;
                for step in 0..match_len {
                    let byte = decoded[copy_from + step];
                    decoded.push(byte);
                }
            }
        }
    }

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // 64 filler bytes followed by "XYZ"; the next block opens with "XYZXYZ".
    let mut first_block = alloc::vec![0xA5; 64];
    first_block.extend_from_slice(b"XYZ");
    let second_block = b"XYZXYZtail".to_vec();

    let mut reconstructed = Vec::new();
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut reconstructed, seq));
    assert_eq!(reconstructed, first_block);

    matcher.add_data(second_block.clone(), |_| {});
    let second_start = reconstructed.len();
    let mut saw_cross_boundary = false;
    matcher.start_matching(|seq| {
        let reuses_tail = matches!(
            seq,
            Sequence::Triple { literals, offset: 3, match_len }
                if literals.is_empty() && match_len >= ROW_MIN_MATCH_LEN
        );
        if reuses_tail {
            saw_cross_boundary = true;
        }
        replay(&mut reconstructed, seq);
    });

    assert!(
        saw_cross_boundary,
        "row matcher should reuse the 3-byte previous-block tail"
    );
    assert_eq!(&reconstructed[second_start..], second_block.as_slice());
}
9646
/// Skipping the same block with `Some(true)` must leave fewer occupied row
/// slots than skipping it with `Some(false)`.
#[test]
fn row_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    // Builds a fresh matcher, skips `bytes` under `hint`, and returns how many
    // row slots ended up occupied.
    let occupied_after_skip = |bytes: Vec<u8>, hint: Option<bool>| {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(ROW_CONFIG);
        matcher.add_data(bytes, |_| {});
        matcher.skip_matching_with_hint(hint);
        matcher
            .row_positions
            .iter()
            .filter(|&&slot| slot != ROW_EMPTY_SLOT)
            .count()
    };

    let data = deterministic_high_entropy_bytes(0xA713_9C5D_44E2_10B1, 4096);

    let dense_slots = occupied_after_skip(data.clone(), Some(false));
    let sparse_slots = occupied_after_skip(data, Some(true));

    assert!(
        sparse_slots < dense_slots,
        "incompressible hint should seed fewer row slots (sparse={sparse_slots}, dense={dense_slots})"
    );
}
9676
/// Regression for the `None` arm of `skip_matching_with_hint`: the skipped
/// range must NOT be densely inserted into the row table.
///
/// This mirrors upstream zstd (`ZSTD_row_fillHashCache` only pre-fills the
/// next-scan cache, not the skipped block's interior), giving up cross-block
/// matches into the skipped interior in exchange for avoiding the per-block
/// O(block_size) insert cost.
///
/// For an input shorter than one block (4096 B against the default 128 KiB
/// block boundary), the only positions that may appear in the row table are
/// those from the `backfill_start` lookback at the block start (at most
/// `ROW_HASH_KEY_LEN - 1` positions when block_start < ROW_HASH_KEY_LEN).
/// With `current_abs_start == 0` that backfill is empty as well, so the table
/// stays completely empty.
#[test]
fn row_skip_matching_with_none_hint_leaves_interior_empty() {
    // Builds a fresh matcher, skips `bytes` under `hint`, and returns how many
    // row slots ended up occupied.
    let occupied_after_skip = |bytes: Vec<u8>, hint: Option<bool>| {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(ROW_CONFIG);
        matcher.add_data(bytes, |_| {});
        matcher.skip_matching_with_hint(hint);
        matcher
            .row_positions
            .iter()
            .filter(|&&slot| slot != ROW_EMPTY_SLOT)
            .count()
    };

    let data = deterministic_high_entropy_bytes(0x9B47_F2A1_8C5E_3306, 4096);

    let none_slots = occupied_after_skip(data.clone(), None);
    // Control: the Some(false) dict-priming path inserts every position in
    // the skipped range.
    let dense_slots = occupied_after_skip(data, Some(false));

    // The contract is pinned by two checks:
    // 1) a None hint at block-start == 0 inserts ZERO positions (nothing
    //    exists before position 0 to backfill);
    // 2) the dense path really does insert, so check 1 is not vacuous.
    assert_eq!(
        none_slots, 0,
        "None hint at block_start=0 must leave row table fully empty \
         (upstream zstd parity — interior NOT inserted, no pre-block backfill possible)",
    );
    assert!(
        dense_slots > 0,
        "Some(false) dict-priming path must still insert densely \
         (sanity check: control case for the `none_slots == 0` assertion)",
    );
}
9731
/// Without a source-size hint, the dfast tables built after resetting to
/// `CompressionLevel::Level(3)` must keep their default sizes: the long-hash
/// table at `DFAST_HASH_BITS` and the short-hash table
/// `DFAST_SHORT_HASH_BITS_DELTA` bits smaller.
///
/// NOTE(review): the function name still says `level2`, but the body exercises
/// `Level(3)`. The name is left untouched; the assertion messages now name the
/// level that is actually tested so a failure is not misattributed.
#[test]
fn driver_unhinted_level2_keeps_default_dfast_hash_table_size() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Level(3));
    // Commit a small block so the dfast tables are actually built.
    let mut space = driver.get_next_space();
    space[..12].copy_from_slice(b"abcabcabcabc");
    space.truncate(12);
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // Upstream zstd-parity split: long-hash at DFAST_HASH_BITS, short-hash one
    // bit smaller (DFAST_SHORT_HASH_BITS_DELTA = 1, matching upstream zstd
    // `chainLog = hashLog - 1` for dfast levels).
    let long_len = driver.dfast_matcher().long_hash.len();
    let short_len = driver.dfast_matcher().short_hash.len();
    assert_eq!(
        long_len,
        1 << DFAST_HASH_BITS,
        "unhinted Level(3) should keep default long-hash table size"
    );
    assert_eq!(
        short_len,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA),
        "unhinted Level(3) short-hash should be one bit smaller than long-hash"
    );
}
9759
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_backend_rejects_undersized_pooled_suffix_store() {
    // Compiled out via `cfg(any())`; retained for reference only.
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Fastest);

    // Seed the pool with a store that is too small for the block below.
    driver.suffix_pool.push(SuffixStore::with_capacity(1024));

    let mut block = driver.get_next_space();
    block.clear();
    block.resize(4096, 0xAB);
    driver.commit_space(block);

    let newest_entry = driver
        .simple()
        .window
        .last()
        .expect("window entry must exist after commit");
    let last_suffix_slots = newest_entry.suffixes.slots.len();
    assert!(
        last_suffix_slots >= 4096,
        "undersized pooled suffix store must not be reused for larger blocks"
    );
}
9786
/// With a 1 KiB source hint, the driver's slice size and the buffers it hands
/// out must both equal the hinted-floor window size.
#[test]
fn source_hint_clamps_driver_slice_size_to_window() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    let window_bytes = driver.window_size() as usize;
    assert_eq!(window_bytes, 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(driver.slice_size, window_bytes);

    let buffer = driver.get_next_space();
    assert_eq!(buffer.len(), window_bytes);
    driver.commit_space(buffer);
}
9801
/// After the slice size shrinks through a source hint, the next buffer handed
/// out must have the smaller length but at least the capacity of the earlier
/// large buffer.
#[test]
fn pooled_space_keeps_capacity_when_slice_size_shrinks() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Default);

    // First round: full-size buffer, committed back to the driver.
    let big_buffer = driver.get_next_space();
    let large_capacity = big_buffer.capacity();
    assert!(large_capacity >= 128 * 1024);
    driver.commit_space(big_buffer);

    // Second round: shrink the slice size through a tiny source hint.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    let small_buffer = driver.get_next_space();
    assert_eq!(small_buffer.len(), 1 << MIN_HINTED_WINDOW_LOG);
    let retained_capacity = small_buffer.capacity();
    assert!(
        retained_capacity >= large_capacity,
        "pooled buffer capacity should be preserved to avoid shrink/grow churn"
    );
}
9822
/// Switching from `Best` on the HashChain backend to `Fastest` must land on
/// the `Simple` backend with the `Fastest` window size.
#[test]
fn driver_best_to_fastest_releases_oversized_hc_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Start at Best, routed onto HashChain through the test-only override
    // (production `Best` sits on level 13, whose native backend differs). This
    // allocates large HC tables (4M hash, 2M chain) so the swap below goes
    // through the HC drain path this test pins.
    driver.reset_on_hc_lazy(CompressionLevel::Best);
    assert_eq!(driver.window_size(), (1u64 << 22));

    // Commit data so the tables are actually allocated via ensure_tables().
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Switching to Fastest swaps the [`MatcherStorage`] enum to the `Simple`
    // variant and drops the `HashChain` variant. The drain block in
    // `Matcher::reset` reassigns `m.table.hash_table` / `chain_table` /
    // `hash3_table` to `Vec::new()` BEFORE the replacement variant is
    // constructed, releasing the table allocations up front. Peak memory
    // during the swap is thereby capped at "old data buffers being drained
    // into `vec_pool` + new `MatchGenerator` skeleton" instead of "old tables
    // still resident + new variant under construction". `Drop` on the old
    // variant would free the tables eventually, but only once the new variant
    // exists, so the early reassign moves the peak. After the switch the HC
    // variant is gone; asserting that storage is now `Simple` covers the
    // invariant the former hash_table/chain_table checks stood in for.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), (1u64 << 19));
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Simple);
}
9859
#[test]
fn driver_better_to_best_resizes_hc_tables() {
    const FIRST_SEED: &[u8; 12] = b"abcabcabcabc";
    const SECOND_SEED: &[u8; 12] = b"xyzxyzxyzxyz";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // The lazy band now runs on the Row backend, so the HC resize path is
    // driven across two BT levels whose native `HcConfig` widths differ:
    // L13 (hash_log 22, chain_log 22) -> L15 (hash_log 23, chain_log 23).
    driver.reset(CompressionLevel::Level(13));
    assert_eq!(driver.window_size(), 1u64 << 22);

    let mut block = driver.get_next_space();
    block[..FIRST_SEED.len()].copy_from_slice(FIRST_SEED);
    block.truncate(FIRST_SEED.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Record the L13 table sizes before moving to the wider level.
    let (l13_hash_len, l13_chain_len) = {
        let hc = driver.hc_matcher();
        (hc.table.hash_table.len(), hc.table.chain_table.len())
    };

    // Move to L15: the tables must grow.
    driver.reset(CompressionLevel::Level(15));
    assert_eq!(driver.window_size(), 1u64 << 22);

    // Commit data again so ensure_tables runs with the new sizes.
    let mut block = driver.get_next_space();
    block[..SECOND_SEED.len()].copy_from_slice(SECOND_SEED);
    block.truncate(SECOND_SEED.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    let hc = driver.hc_matcher();
    let l15_hash_len = hc.table.hash_table.len();
    let l15_chain_len = hc.table.chain_table.len();
    assert!(
        l15_hash_len > l13_hash_len,
        "L15 hash_table ({}) should be larger than L13 ({})",
        l15_hash_len,
        l13_hash_len
    );
    assert!(
        l15_chain_len > l13_chain_len,
        "L15 chain_table ({}) should be larger than L13 ({})",
        l15_chain_len,
        l13_chain_len
    );
}
9905
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_dictionary_preserves_history_for_first_full_block() {
    // The block repeats the dictionary exactly, so the expected match
    // offset equals the dictionary length (8).
    const PAYLOAD: &[u8] = b"abcdefgh";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    driver.prime_with_dictionary(PAYLOAD, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(PAYLOAD);
    driver.commit_space(block);

    // Latch as soon as any emitted sequence is a literal-free match at
    // offset 8 of at least the minimum match length.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "first full block should still match dictionary-primed history"
    );
}
9940
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_large_dictionary_preserves_early_history_until_first_block() {
    // 24-byte dictionary whose FIRST eight bytes are replayed as the block,
    // so the expected match reaches back the full dictionary length (24).
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    driver.prime_with_dictionary(DICT, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"abcdefgh");
    driver.commit_space(block);

    // Latch once a literal-free match at offset 24 shows up.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 24 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "dictionary bytes should remain addressable until frame output exceeds the live window"
    );
}
9975
#[test]
fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
    // Repeat offsets handed over with an empty dictionary body.
    let seeded_history = [11, 7, 3];

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    driver.prime_with_dictionary(&[], seeded_history);

    // The Simple matcher must carry the seeded offsets despite there being
    // no dictionary content.
    assert_eq!(driver.simple_mut().offset_hist, seeded_history);
}
9985
#[test]
fn hc_prime_with_empty_dictionary_disables_btultra2_seed_pass() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    // Prime with offsets only; the dictionary body is empty.
    driver.prime_with_dictionary(&[], [11, 7, 3]);

    assert_eq!(driver.hc_matcher().table.offset_hist, [11, 7, 3]);

    // Query the seed-pass decision one byte above the predefined threshold.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming, even when dict content is empty"
    );
}
10001
#[test]
fn primed_snapshot_not_restored_across_ldm_config_change() {
    // The CDict-equivalent primed snapshot clones `storage`; on the BT
    // backend that includes `BtMatcher::ldm_producer`. Restoring a snapshot
    // taken under one LDM configuration into a reset that resolved a
    // different one would leave a stale producer behind, so `PrimedKey`
    // has to fold the LDM override into the key: the restore is refused and
    // the caller re-primes.
    use super::parameters::CompressionParameters;

    let level = CompressionLevel::Level(19);
    let dict = b"abcdefghabcdefghabcdefgh";

    // Two override sets for the same level: LDM explicitly enabled, and the
    // builder defaults (LDM never touched).
    let ldm_on = CompressionParameters::builder(level)
        .enable_long_distance_matching(true)
        .build()
        .unwrap()
        .overrides();
    let ldm_off = CompressionParameters::builder(level)
        .build()
        .unwrap()
        .overrides();

    let mut driver = MatchGeneratorDriver::new(1024, 1);

    // Prime and capture a snapshot with LDM on at level 19.
    driver.set_param_overrides(Some(ldm_on));
    driver.reset(level);
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Identical dictionary and level, LDM now off: the captured LDM state
    // is stale, so the restore has to be refused.
    driver.set_param_overrides(Some(ldm_off));
    driver.reset(level);
    let restored_across_change = driver.restore_primed_dictionary(level);
    assert!(
        !restored_across_change,
        "primed snapshot restored across an LDM config change (stale producer)",
    );

    // Sanity check that the key is not over-tight: re-prime and capture
    // under LDM-off, then restore under the IDENTICAL LDM-off config — this
    // one must match.
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);
    driver.reset(level);
    let restored_same_config = driver.restore_primed_dictionary(level);
    assert!(
        restored_same_config,
        "primed snapshot not restored under identical LDM config",
    );
}
10050
#[test]
fn hc_prime_with_dictionary_disables_btultra2_seed_pass() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    // Prime with a non-empty dictionary body this time.
    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // Query the seed-pass decision one byte above the predefined threshold.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming with content"
    );
}
10065
#[test]
fn dfast_prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    // Level(4) runs Dfast with the greedy double-fast loop (upstream zstd
    // parity: clevels.h L3/L4 are both `ZSTD_dfast`, which has no lazy
    // lookahead). That loop needs at least `HASH_READ_SIZE` (8) bytes ahead
    // of the probe cursor, hence a 16-byte dict followed by a 16-byte block:
    // the whole block matches the dict at offset = dict length = 16.
    driver.reset(CompressionLevel::Level(4));

    let payload = b"abcdefghijklmnop";
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    // Latch once a literal-free match reaching back exactly one payload
    // length is emitted.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty()
                && offset == payload.len()
                && match_len >= DFAST_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "dfast backend should match dictionary-primed history in first full block"
    );
}
10104
#[test]
fn prime_with_dictionary_does_not_inflate_reported_window_size() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // Sample the reported window on either side of priming with a
    // dictionary that is larger than one slice.
    let window_before_priming = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let window_after_priming = driver.window_size();

    assert_eq!(
        window_after_priming, window_before_priming,
        "dictionary retention budget must not change reported frame window size"
    );
}
10119
#[test]
fn primed_snapshot_not_restored_when_window_hint_differs() {
    // A copy-snapshot has to be keyed on the resolved reset parameters, not
    // on the CompressionLevel alone. `reset()` caps window_log by the
    // source-size hint, so two frames at the same level but with different
    // hints resolve to different windows. If a snapshot captured under the
    // larger hint were restored into a reset for the smaller one, the frame
    // header would advertise the smaller window while the matcher's
    // `max_window_size` (taken from the restored storage) still covered the
    // larger one. The encoder could then emit a match (e.g. into the
    // dictionary) beyond the advertised window and produce an undecodable
    // frame. Restore must REFUSE whenever the resolved window differs.
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";
    const LARGE_HINT: u64 = 256 * 1024;
    const SMALL_HINT: u64 = 48 * 1024;

    let level = CompressionLevel::Best;
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Frame A — large hint, larger resolved window: prime and capture.
    driver.set_source_size_hint(LARGE_HINT);
    driver.reset(level);
    let big_window = driver.window_size();
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Frame B — smaller hint at the SAME level, smaller resolved window.
    driver.set_source_size_hint(SMALL_HINT);
    driver.reset(level);
    let small_window = driver.window_size();
    assert!(
        small_window < big_window,
        "precondition: the two hints must resolve to different windows \
         (small={small_window}, big={big_window})"
    );

    assert!(
        !driver.restore_primed_dictionary(level),
        "snapshot captured at window {big_window} must NOT be restored into a \
         reset advertising window {small_window} (level alone is an insufficient key)"
    );
}
10158
#[test]
fn primed_snapshot_restored_for_hints_in_same_window_bucket() {
    // The snapshot key has to normalize the source-size hint to the resolved
    // matcher geometry rather than the raw hinted byte count. Every
    // hint-dependent parameter in `reset()` (window_log cap,
    // HC/Fast/Dfast/Row table widths, the Fast attach-vs-copy cutoff) is
    // derived from `ceil_log2(hint)`, so two different hints in the same
    // ceil-log bucket resolve to the *identical* matcher shape. A key built
    // from the raw bytes over-keys: the second frame would re-prime in full
    // although the cached snapshot fits perfectly. Restore must SUCCEED
    // across same-bucket hints.
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";

    let level = CompressionLevel::Best;
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // 300 KiB and 400 KiB both sit in ceil_log2 bucket 19
    // (2^18 < n <= 2^19), so they resolve to the same window and table
    // widths.
    driver.set_source_size_hint(300 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    driver.set_source_size_hint(400 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: same-bucket hints must resolve to the same window \
         (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "snapshot captured at a 300 KiB hint must be restored into a 400 KiB \
         hint that resolves to the identical matcher shape (raw bytes over-key)"
    );
}
10196
#[test]
fn primed_snapshot_restored_across_level22_tier_hints() {
    // At level 22 several ceil-log buckets collapse onto a single upstream
    // zstd source-size tier: `resolve_level_params(Level(22), ..)` picks the
    // HC config and window_log by the raw `<= 16 KiB / 128 KiB / 256 KiB`
    // thresholds. A 20 KiB and a 100 KiB hint (ceil-log buckets 15 and 17)
    // therefore both fall into the `<= 128 KiB` tier and resolve to the
    // IDENTICAL matcher (same window_log, same HC hash/chain/search
    // geometry). A key based on the raw ceil-log bucket would still reject
    // this restore because the buckets differ; the key has to compare the
    // resolved matcher shape so both hints share one snapshot.
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";

    let level = CompressionLevel::Level(22);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Capture under the 20 KiB hint.
    driver.set_source_size_hint(20 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Reset under the 100 KiB hint (same tier).
    driver.set_source_size_hint(100 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: both hints must land in the same Level 22 upstream zstd tier \
         (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "Level 22 snapshot captured at a 20 KiB hint must be restored into a \
         100 KiB hint that resolves to the same upstream zstd tier (different ceil-log \
         buckets, identical matcher shape)"
    );
}
10233
#[test]
fn primed_snapshot_not_restored_across_fast_attach_copy_boundary() {
    // The Fast attach-vs-copy cutoff (8 KiB) lies INSIDE one resolved matcher
    // shape: an 8192-byte and an 8193-byte hint both clamp Level 1 to
    // window_log 14 with the same Fast table widths, so `LevelParams` and
    // `table_bits` are identical. Yet 8192 attaches (separate dict table)
    // while 8193 copies (dict primed into the live table). The snapshot key
    // therefore has to carry the attach/copy mode itself; otherwise both
    // resets would share a key and a copy-mode snapshot could be restored
    // into an attach-mode reset (a different `storage` shape). Restore must
    // REFUSE across the boundary.
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";

    let level = CompressionLevel::Level(1);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Copy side (hint > 8 KiB): prime and capture.
    driver.set_source_size_hint(8193);
    driver.reset(level);
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Attach side (hint <= 8 KiB) with the same resolved window/table shape.
    driver.set_source_size_hint(8192);
    driver.reset(level);
    assert!(
        !driver.restore_primed_dictionary(level),
        "a copy-mode snapshot (8193 B hint) must NOT be restored into an \
         attach-mode reset (8192 B hint) that resolves to the same params but a \
         different dict-table shape"
    );
}
10265
#[test]
fn primed_snapshot_fast_attach_does_not_over_key_non_simple_backends() {
    // `fast_attach` is a Simple/Fast-backend concept (the 8 KiB attach-vs-copy
    // table split). Dfast/Row/HashChain each have their OWN attach/copy
    // regime (`DFAST_ATTACH_DICT_CUTOFF_LOG`, `ROW_ATTACH_DICT_CUTOFF_LOG`,
    // `HC_ATTACH_DICT_CUTOFF_LOG`), but those are deliberately kept OUT of
    // the `fast_attach` key, which models only the Fast table split. Their
    // snapshots are keyed by the resolved matcher geometry instead, and the
    // HC modes share one window geometry so an HC cross-mode restore stays
    // decodable (see `prime_with_dictionary`).
    //
    // Either way the `fast_attach` bit must NOT enter a non-Simple snapshot
    // key. Otherwise an unhinted capture (which would record
    // `fast_attach = true`) and a hinted reset resolving to the IDENTICAL
    // `LevelParams` would key differently and force a needless re-prime.
    //
    // The test uses an explicit Row-backend level, `Level(12)`, rather than
    // the `Best` alias: `Best` now sits on level 13 (Btlazy2), so the alias
    // no longer reaches the Row arm pinned here. It also pins the Row arm
    // recording its RESOLVED hash width on the unhinted path (a 0 default
    // there keyed unhinted-vs-hinted apart).
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";

    let level = CompressionLevel::Level(12);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Capture without any source-size hint.
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Reset with a hint big enough to resolve to the same window/params as
    // the unhinted level (>= 2^window_log, making the source-size cap a
    // no-op).
    driver.set_source_size_hint(64 * 1024 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: the large hint must resolve to the same window as the \
         unhinted level (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "a Row snapshot must restore across an unhinted vs large-hinted \
         reset that resolves to the identical matcher — `fast_attach` is a Fast \
         backend concept and must not over-key non-Simple shapes"
    );
}
10312
#[cfg(any())] // disabled: tested SuffixStore-per-block tail-handling specific to legacy MatchGenerator
#[test]
fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
    let mut driver = MatchGeneratorDriver::new(8, 2);
    driver.reset(CompressionLevel::Fastest);

    // The 9-byte dictionary leaves a 1-byte tail chunk (a capacity=1 suffix
    // table); such a tail must never be committed to the matcher window.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let every_chunk_is_matchable = driver
        .simple()
        .window
        .iter()
        .all(|chunk| chunk.data.len() >= MIN_MATCH_LEN);
    assert!(
        every_chunk_is_matchable,
        "dictionary priming must not commit tails shorter than MIN_MATCH_LEN"
    );
}
10332
#[test]
fn prime_with_dictionary_counts_only_committed_tail_budget() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    let budget_before = driver.simple_mut().max_window_size;
    // Nine bytes = one full 8-byte slice plus a 1-byte tail that cannot be
    // committed.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
    let budget_after = driver.simple_mut().max_window_size;

    // Only the eight committed bytes may extend the retention budget.
    assert_eq!(
        budget_after,
        budget_before + 8,
        "retention budget must account only for dictionary bytes actually committed to history"
    );
}
10348
#[test]
fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));

    let budget_before = driver.dfast_matcher().max_window_size;
    // Twelve bytes = one full slice plus a 4-byte tail. Dfast can still use
    // that tail through short-hash overlap into the next block, so the tail
    // is expected to stay retained.
    driver.prime_with_dictionary(b"abcdefghijkl", [1, 4, 8]);
    let budget_after = driver.dfast_matcher().max_window_size;

    assert_eq!(
        budget_after,
        budget_before + 12,
        "dfast retention budget should include 4-byte dictionary tails"
    );
}
10365
#[test]
fn row_prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    // Level(5) is the greedy Row backend (LEVEL_TABLE row 5: Greedy /
    // RowHash). Level(4) routes to Dfast these days, so Level(5) is required
    // to actually reach `RowMatchGenerator`'s dictionary priming. With a
    // 16-byte dict and a 16-byte block the entire block can match the primed
    // dict (offset = dict length = 16).
    driver.reset(CompressionLevel::Level(5));

    let payload = b"abcdefghijklmnop";
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    // Latch once a literal-free match reaching back exactly one payload
    // length is emitted.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty()
                && offset == payload.len()
                && match_len >= ROW_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "row backend should match dictionary-primed history in first full block"
    );
}
10404
#[test]
fn row_prime_with_dictionary_subtracts_uncommitted_tail_budget() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));

    let window_before = driver.row_matcher().max_window_size;
    // With a slice size of 8, the ninth byte forms a tail shorter than 4
    // bytes that cannot be committed; it has to be subtracted from the
    // retained budget.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
    let window_after = driver.row_matcher().max_window_size;

    assert_eq!(
        window_after,
        window_before + 8,
        "row retained window must exclude uncommitted 1-byte tail"
    );
}
10421
#[test]
fn prime_with_dictionary_budget_shrinks_after_row_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    // Shrink the live window so the dictionary-primed slices get evicted
    // after very little input.
    driver.row_matcher_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    let window_before = driver.row_matcher().max_window_size;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(driver.row_matcher().max_window_size, window_before + 24);

    // Push two full blocks through without matching.
    let fillers: [&[u8]; 2] = [b"AAAAAAAA", b"BBBBBBBB"];
    for filler in fillers {
        let mut block = driver.get_next_space();
        block.clear();
        block.extend_from_slice(filler);
        driver.commit_space(block);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.row_matcher().max_window_size,
        window_before,
        "retired dictionary budget must not remain reusable for live history"
    );
}
10452
/// A Row → Simple transition drops the Row variant, leaving Simple as the
/// active backend after the switch. The pre-enum window-emptied check
/// (`driver.row_matcher().window.is_empty()`) is deliberately absent: once
/// the swap has happened the `Row` variant no longer exists, so no accessor
/// can inspect it. "Window cleared" is therefore replaced by "variant
/// dropped", and a later `row_matcher()` call would panic by design. The
/// pool-recycling aspect of the row backend is covered by
/// [`driver_row_commit_recycles_block_buffer_into_pool`].
#[test]
fn row_get_last_space_then_reset_to_fastest_drops_row_variant() {
    const BLOCK: &[u8] = b"row-data";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Row);

    let mut buf = driver.get_next_space();
    buf.clear();
    buf.extend_from_slice(BLOCK);
    driver.commit_space(buf);

    // The committed bytes must be readable back while Row is still active.
    assert_eq!(driver.get_last_space(), b"row-data");

    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Simple);
}
10478
/// A committed Row block has to hand its input buffer back to `vec_pool`
/// right away: the bytes are mirrored into the contiguous `history`, so
/// keeping a second copy in the window serves no purpose. This guards the
/// chunk-length window. The earlier `VecDeque<Vec<u8>>` window held on to a
/// full `block_capacity` buffer per committed block, which inflated peak
/// memory to many times the live byte count on a heavily pre-split frame.
/// With recycling at commit time the pool grows by exactly one Vec per
/// committed block.
#[test]
fn driver_row_commit_recycles_block_buffer_into_pool() {
    const BLOCK: &[u8] = b"row-data-to-recycle";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Row);

    let before_pool = driver.vec_pool.len();
    let mut buf = driver.get_next_space();
    buf.clear();
    buf.extend_from_slice(BLOCK);
    driver.commit_space(buf);

    // Strict `>` rather than `>=`: a fresh driver starts at
    // `before_pool == 0`, so the weaker bound would pass even when the
    // commit failed to recycle. Strict growth shows the buffer went back to
    // the pool at commit time instead of being kept in the window (the
    // pre-`chunk_lens` bug).
    assert!(
        driver.vec_pool.len() > before_pool,
        "row commit must recycle the committed block buffer into vec_pool \
         (before_pool = {before_pool}, after = {})",
        driver.vec_pool.len()
    );
    // The bytes are still reachable through the contiguous history mirror.
    assert_eq!(driver.get_last_space(), b"row-data-to-recycle");
}
10512
#[test]
fn adjust_params_for_zero_source_size_uses_min_hinted_window_floor() {
    // Start from the unhinted Level(4) parameters and force a large window
    // so the adjustment has something to clamp.
    let mut oversized = resolve_level_params(CompressionLevel::Level(4), None);
    oversized.window_log = 22;

    // A zero-byte source size must land on the minimum hinted window log.
    let clamped_window_log = adjust_params_for_source_size(oversized, 0).window_log;
    assert_eq!(clamped_window_log, MIN_HINTED_WINDOW_LOG);
}
10520
#[test]
fn common_prefix_len_matches_scalar_reference_across_offsets() {
    /// Byte-at-a-time oracle: index of the first differing byte, or the
    /// length of the shorter slice when one is a prefix of the other.
    fn scalar_reference(a: &[u8], b: &[u8]) -> usize {
        let shorter = a.len().min(b.len());
        (0..shorter).find(|&i| a[i] != b[i]).unwrap_or(shorter)
    }

    // Buffer lengths, slice start offsets and mismatch positions to sweep.
    const TOTAL_LENS: &[usize] = &[
        0, 1, 5, 15, 16, 17, 31, 32, 33, 64, 65, 127, 191, 257, 320,
    ];
    const STARTS: &[usize] = &[0, 1, 3];
    const MISMATCHES: &[usize] = &[0, 1, 7, 15, 16, 31, 32, 47, 63, 95, 127, 128, 129, 191];

    for &total_len in TOTAL_LENS {
        // Deterministic, non-constant fill pattern.
        let base: Vec<u8> = (0..total_len)
            .map(|i| ((i * 13 + 7) & 0xFF) as u8)
            .collect();

        // Start offsets beyond the end of the buffer are skipped.
        for &start in STARTS.iter().filter(|&&s| s <= total_len) {
            let a = &base[start..];
            let b = a.to_vec();

            // Identical inputs.
            assert_eq!(
                common_prefix_len(a, &b),
                scalar_reference(a, &b),
                "equal slices total_len={total_len} start={start}"
            );

            // A single flipped byte at each in-range mismatch position.
            let len = a.len();
            for &mismatch in MISMATCHES.iter().filter(|&&m| m < len) {
                let mut altered = b.clone();
                altered[mismatch] ^= 0x5A;
                assert_eq!(
                    common_prefix_len(a, &altered),
                    scalar_reference(a, &altered),
                    "total_len={total_len} start={start} mismatch={mismatch}"
                );
            }

            // A flipped final byte, for non-empty slices only.
            if let Some(mismatch) = len.checked_sub(1) {
                let mut altered = b.clone();
                altered[mismatch] ^= 0xA5;
                assert_eq!(
                    common_prefix_len(a, &altered),
                    scalar_reference(a, &altered),
                    "tail mismatch total_len={total_len} start={start} mismatch={mismatch}"
                );
            }
        }
    }

    // Inputs of different lengths that agree on every shared byte.
    let long = alloc::vec![0xAB; 320];
    let shorter = alloc::vec![0xAB; 137];
    assert_eq!(
        common_prefix_len(&long, &shorter),
        scalar_reference(&long, &shorter)
    );
}
10583
#[test]
fn row_pick_lazy_returns_none_when_next_is_better() {
    // Row matcher over 64 identical bytes.
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.add_data(alloc::vec![b'a'; 64], |_| {});
    matcher.ensure_tables();

    // Hand the picker a minimum-length candidate 16 bytes into the history.
    let abs_pos = matcher.history_abs_start + 16;
    let candidate = MatchCandidate {
        start: abs_pos,
        offset: 8,
        match_len: ROW_MIN_MATCH_LEN,
    };

    let picked = matcher.pick_lazy_match(abs_pos, 0, Some(candidate));
    assert!(
        picked.is_none(),
        "lazy picker should defer when next position is clearly better"
    );
}
10602
#[test]
fn row_pick_lazy_depth2_returns_none_when_next2_significantly_better() {
    // Depth-2 lazy Row matcher with the search depth zeroed and a preset
    // repeat-offset history.
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.lazy_depth = 2;
    matcher.search_depth = 0;
    matcher.offset_hist = [6, 9, 1];

    // 40 filler bytes with the fixture pattern placed at 11..30.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAB");
    matcher.add_data(fixture, |_| {});
    matcher.ensure_tables();

    let abs_pos = matcher.history_abs_start + 20;
    let best = matcher
        .best_match(abs_pos, 0)
        .expect("expected baseline repcode match");
    assert_eq!(best.offset, 9);
    // The fixture data fixes the baseline match length (the offset-9 rep
    // run spans 6 bytes); the accept threshold has no influence on it.
    assert_eq!(best.match_len, 6);

    // A +1 candidate, if there is one, must not beat the baseline.
    if let Some(next) = matcher.best_match(abs_pos + 1, 1) {
        assert!(next.match_len <= best.match_len);
    }

    // The +2 candidate has to win by more than a single byte.
    let next2 = matcher
        .best_match(abs_pos + 2, 2)
        .expect("expected +2 candidate");
    assert!(
        next2.match_len > best.match_len + 1,
        "+2 candidate must be significantly better for depth-2 lazy skip"
    );

    let picked = matcher.pick_lazy_match(abs_pos, 0, Some(best));
    assert!(
        picked.is_none(),
        "lazy picker should defer when +2 candidate is significantly better"
    );
}
10641
/// With `lazy_depth = 2`, a +2 candidate that is exactly one byte longer than
/// the current best is not enough: the picker must hand back the current best
/// unchanged.
#[test]
fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() {
    let mut row_matcher = RowMatchGenerator::new(1 << 22);
    row_matcher.configure(ROW_CONFIG);
    row_matcher.lazy_depth = 2;
    row_matcher.search_depth = 0;
    row_matcher.offset_hist = [6, 9, 1];

    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAZ");
    row_matcher.add_data(fixture, |_| {});
    row_matcher.ensure_tables();

    let pos = row_matcher.history_abs_start + 20;
    let baseline = row_matcher
        .best_match(pos, 0)
        .expect("expected baseline repcode match");
    assert_eq!(baseline.offset, 9);
    // The fixture bytes pin the baseline length (the offset-9 rep run is
    // 6 bytes long); it does not depend on the accept threshold.
    assert_eq!(baseline.match_len, 6);

    let plus_two = row_matcher
        .best_match(pos + 2, 2)
        .expect("expected +2 candidate");
    assert_eq!(plus_two.match_len, baseline.match_len + 1);

    let kept = row_matcher
        .pick_lazy_match(pos, 0, Some(baseline))
        .expect("lazy picker should keep current best");
    // Compare field by field against the baseline candidate.
    assert_eq!(kept.start, baseline.start);
    assert_eq!(kept.offset, baseline.offset);
    assert_eq!(kept.match_len, baseline.match_len);
}
10675
/// Checks that `hash_and_row` returns the row index and tag obtained by
/// splitting the top `row_hash_log + ROW_TAG_BITS` bits of the shared hash mix.
#[test]
fn row_hash_and_row_extracts_high_bits() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    let fixture = alloc::vec![
        0xAA, 0xBB, 0xCC, 0x11, 0x10, 0x20, 0x30, 0x40, 0xAA, 0xBB, 0xCC, 0x22, 0x50, 0x60,
        0x70, 0x80,
    ];
    matcher.add_data(fixture, |_| {});
    matcher.ensure_tables();

    let pos = matcher.history_abs_start + 8;
    let (got_row, got_tag) = matcher
        .hash_and_row(pos)
        .expect("row hash should be available");

    let rel = pos - matcher.history_abs_start;
    let history = matcher.live_history();
    // Mirror `row_key_value`: an mls-wide masked key when 8 lookahead bytes
    // exist, the 4-byte key in the tail. `rel = 8` on a 16-byte history has
    // exactly 8 bytes left, so the wide arm applies here.
    let key_bytes = matcher.mls.min(6);
    let key_mask = (1u64 << (key_bytes * 8)) - 1;
    let raw = u64::from_le_bytes(history[rel..rel + 8].try_into().unwrap());
    let mixed = crate::encoding::fastpath::hash_mix_u64_with_kernel(
        matcher.hash_kernel,
        raw & key_mask,
    );

    // Keep only the top `row_hash_log + ROW_TAG_BITS` bits, then split them:
    // the low `ROW_TAG_BITS` go to the tag, the rest select the row.
    let kept_bits = matcher.row_hash_log + ROW_TAG_BITS;
    let top = mixed >> (u64::BITS as usize - kept_bits);
    let row_mask = (1usize << matcher.row_hash_log) - 1;
    let expected_row = ((top >> ROW_TAG_BITS) as usize) & row_mask;
    let expected_tag = top as u8;

    assert_eq!(got_row, expected_row);
    assert_eq!(got_tag, expected_tag);
}
10713
/// A repcode offset of 3 from absolute position 12 points at 9, which is
/// below `history_abs_start` (10); `repcode_candidate` must return `None`.
#[test]
fn row_repcode_skips_candidate_before_history_start() {
    let mut row_matcher = RowMatchGenerator::new(1 << 22);
    row_matcher.configure(ROW_CONFIG);
    row_matcher.history = alloc::vec![b'a'; 20];
    row_matcher.history_start = 0;
    row_matcher.history_abs_start = 10;
    row_matcher.offset_hist = [3, 0, 0];

    let candidate = row_matcher.repcode_candidate(12, 1);
    assert!(candidate.is_none());
}
10725
/// Probing the last byte (position 4) of a 5-byte history must yield no
/// repcode candidate.
#[test]
fn row_repcode_returns_none_when_position_too_close_to_history_end() {
    let mut row_matcher = RowMatchGenerator::new(1 << 22);
    row_matcher.configure(ROW_CONFIG);
    row_matcher.history = b"abcde".to_vec();
    row_matcher.history_start = 0;
    row_matcher.history_abs_start = 0;
    row_matcher.offset_hist = [1, 0, 0];

    let candidate = row_matcher.repcode_candidate(4, 1);
    assert!(candidate.is_none());
}
10737
/// On x86_64 hosts that report SSE4.2, the runtime-dispatched hash mix must
/// equal the direct SSE4.2 implementation. Hosts without SSE4.2 skip the test.
#[cfg(all(feature = "std", target_arch = "x86_64"))]
#[test]
fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath::{self, FastpathKernel};

    if is_x86_feature_detected!("sse4.2") {
        let input = 0x0123_4567_89AB_CDEFu64;
        // SAFETY: the runtime feature check above guarantees SSE4.2 is available.
        let direct = unsafe { fastpath::sse42::hash_mix_u64(input) };
        // Dispatcher must resolve to SSE4.2 (or better) and produce the same mix.
        let via_dispatch = fastpath::dispatch_hash_mix_u64(input);
        let selected = fastpath::select_kernel();
        if selected != FastpathKernel::Sse42 {
            // AVX2 kernel uses the same CRC32 instruction under the hood.
            assert_eq!(via_dispatch, direct, "AVX2/SSE4.2 share CRC32 mix");
        } else {
            assert_eq!(via_dispatch, direct);
        }
    }
}
10758
/// On little-endian aarch64 hosts that report the `crc` feature, the
/// runtime-dispatched hash mix must equal the direct NEON-module
/// implementation. Hosts without the feature skip the test.
#[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))]
#[test]
fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath;

    if is_aarch64_feature_detected!("crc") {
        let input = 0x0123_4567_89AB_CDEFu64;
        // SAFETY: the runtime feature check above guarantees CRC32 is available.
        let direct = unsafe { fastpath::neon::hash_mix_u64(input) };
        let via_dispatch = fastpath::dispatch_hash_mix_u64(input);
        assert_eq!(via_dispatch, direct);
    }
}
10772
/// `MatchTable::hash3_position` must equal the reference 3-byte hash: the
/// little-endian u32 shifted left by 8, multiplied by `HC_PRIME3BYTES`, keeping
/// the top `HC3_HASH_LOG` bits.
#[test]
fn hc_hash3_position_matches_hash3_formula() {
    let bytes = *b"abcd";
    // Shifting left by 8 drops the fourth byte so only three bytes feed the hash.
    let three_byte_key = u32::from_le_bytes(bytes) << 8;
    let mixed = three_byte_key.wrapping_mul(HC_PRIME3BYTES);
    let expected = (mixed >> (32 - HC3_HASH_LOG)) as usize;

    let actual = super::match_table::storage::MatchTable::hash3_position(&bytes, HC3_HASH_LOG);
    assert_eq!(actual, expected);
}
10783
/// For a lazy-strategy HC matcher, `hash_position` must equal the reference
/// 4-byte hash: little-endian u32 times `HC_PRIME4BYTES`, keeping the top
/// `hash_log` bits.
#[test]
fn hc_hash_position_matches_hash4_formula() {
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    let bytes = *b"abcd";
    let mixed = u32::from_le_bytes(bytes).wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - matcher.table.hash_log)) as usize;

    assert_eq!(matcher.table.hash_position(&bytes), expected);
}
10793
/// With the BtUltra2 level-22 configuration, `hash_position_with_mls` at
/// `BtMatcher::HASH_MLS` must equal the 4-byte hash of the first four input
/// bytes.
#[test]
fn btultra2_main_hash_uses_hash4_formula() {
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(
        BTULTRA2_HC_CONFIG_L22,
        super::strategy::StrategyTag::BtUltra2,
        27,
    );

    let bytes = *b"abcdefgh";
    // Reference value: only the first four bytes participate.
    let head = u32::from_le_bytes(bytes[..4].try_into().unwrap());
    let mixed = head.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - matcher.table.hash_log)) as usize;

    let actual = super::match_table::storage::MatchTable::hash_position_with_mls(
        &bytes,
        matcher.table.hash_log,
        super::bt::BtMatcher::HASH_MLS,
    );
    assert_eq!(actual, expected);
}
10812
/// A history one byte shorter than `ROW_MIN_MATCH_LEN` cannot hold a match, so
/// `row_candidate` at position 0 must return `None`.
#[test]
fn row_candidate_returns_none_when_abs_pos_near_end_of_history() {
    let mut row_matcher = RowMatchGenerator::new(1 << 22);
    row_matcher.configure(ROW_CONFIG);
    // One byte short of the accept floor: from abs_pos 0 there are fewer
    // than `ROW_MIN_MATCH_LEN` bytes left, so the length gate in
    // `row_candidate` must short-circuit to `None` before touching the
    // (here unbuilt) row tables.
    let too_short = ROW_MIN_MATCH_LEN - 1;
    row_matcher.history = alloc::vec![b'a'; too_short];
    row_matcher.history_start = 0;
    row_matcher.history_abs_start = 0;

    let candidate = row_matcher.row_candidate(0, 0);
    assert!(candidate.is_none());
}
10827
/// For a 3-byte history, `chain_candidates` at position 0 must yield only
/// `usize::MAX` sentinel entries.
#[test]
fn hc_chain_candidates_returns_sentinels_for_short_suffix() {
    let mut matcher = HcMatchGenerator::new(32);
    matcher.table.history = b"abc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.ensure_tables();

    let found = matcher.hc.chain_candidates(&matcher.table, 0);
    let only_sentinels = found.iter().all(|&pos| pos == usize::MAX);
    assert!(only_sentinels);
}
10839
/// After `reset`, `history_abs_start` must equal the previous frame's end, and
/// every hash-table slot that still decodes to a position must decode below
/// that new floor.
#[test]
fn hc_reset_advances_floor_past_prior_frame_entries() {
    use super::match_table::storage::MatchTable;

    let mut matcher = HcMatchGenerator::new(32);
    matcher.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    // Populate real hash / chain entries for the first frame's positions.
    matcher.table.insert_positions(0, 6);
    let previous_end = matcher.table.history_abs_end();
    assert_eq!(previous_end, 10);
    assert!(matcher.table.hash_table.iter().any(|&v| v != HC_EMPTY));

    matcher.reset(|_| {});

    // Behavioural contract: the previous frame's entries are no longer
    // matchable. `reset` advances the floor past every prior position
    // instead of zeroing the tables, so each populated slot now decodes
    // to an absolute position strictly below `history_abs_start` and is
    // rejected by the `window_low` guard before any byte is read.
    assert_eq!(matcher.table.history_abs_start, previous_end);
    let floor = matcher.table.history_abs_start;
    let base = matcher.table.position_base;
    let shift = matcher.table.index_shift;
    let decoded = matcher
        .table
        .hash_table
        .iter()
        .filter_map(|&slot| MatchTable::stored_abs_position_fast(slot, base, shift));
    for candidate_abs in decoded {
        assert!(
            candidate_abs < floor,
            "a prior-frame entry must resolve below the advanced floor"
        );
    }
}
10871
/// When `history_abs_start` already sits at `REBASE_RESET_FLOOR_CEILING`,
/// `reset` must rewind `history_abs_start` and `position_base` to 0 and leave
/// both tables filled with `HC_EMPTY`.
#[test]
fn hc_reset_full_zeroes_when_floor_would_cross_ceiling() {
    use super::match_table::storage::REBASE_RESET_FLOOR_CEILING;

    let mut matcher = HcMatchGenerator::new(32);
    matcher.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    // Arbitrary fill values so a surviving stale entry would be detectable.
    matcher.table.hash_table.fill(123);
    matcher.table.chain_table.fill(456);
    // Push the would-be floor (`history_abs_end`) past the ceiling so
    // `reset` takes the bounded fallback: rewind to the origin and zero
    // the tables, keeping the absolute cursor from climbing toward
    // `usize::MAX` on 32-bit targets.
    matcher.table.history_abs_start = REBASE_RESET_FLOOR_CEILING;

    matcher.reset(|_| {});

    assert_eq!(matcher.table.history_abs_start, 0);
    assert_eq!(matcher.table.position_base, 0);
    let hash_cleared = matcher.table.hash_table.iter().all(|&v| v == HC_EMPTY);
    assert!(hash_cleared);
    let chain_cleared = matcher.table.chain_table.iter().all(|&v| v == HC_EMPTY);
    assert!(chain_cleared);
}
10893
/// `start_matching` on an empty committed block must never invoke the
/// sequence callback.
#[test]
fn hc_start_matching_returns_early_for_empty_current_block() {
    let mut matcher = HcMatchGenerator::new(32);
    matcher.table.add_data(Vec::new(), |_| {});

    let mut callback_fired = false;
    matcher.start_matching(|_| {
        callback_fired = true;
    });

    assert!(
        !callback_fired,
        "empty current block should not emit sequences"
    );
}
10902
/// Produces `len` pseudo-random bytes from a 64-bit xorshift (13, 7, 17) stream
/// seeded with `seed`, emitting bits 40..48 of each successive state.
///
/// The output depends only on `seed` and `len`. A zero seed is a fixed point
/// of the generator and yields all-zero bytes.
#[cfg(test)]
fn deterministic_high_entropy_bytes(seed: u64, len: usize) -> Vec<u8> {
    let mut state = seed;
    (0..len)
        .map(|_| {
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            (state >> 40) as u8
        })
        .collect()
}
10915
/// Splits `data` into consecutive `(start, len)` block ranges, asking
/// `optimal_block_size` at level 22 for each block length.
///
/// Every returned length is at least 1 and at most
/// `min(remaining, HC_BLOCKSIZE_MAX)`, so the ranges tile `data` exactly.
#[cfg(feature = "bench_internals")]
pub(crate) fn level22_block_ranges(data: &[u8]) -> Vec<(usize, usize)> {
    let block_max = super::cost_model::HC_BLOCKSIZE_MAX;
    let total = data.len();

    let mut ranges = Vec::new();
    let mut start = 0usize;
    let mut savings = 0i64;
    while start < total {
        let remaining = total - start;
        let window_len = remaining.min(block_max);
        let proposed = crate::encoding::frame_compressor::optimal_block_size(
            CompressionLevel::Level(22),
            &data[start..start + window_len],
            remaining,
            block_max,
            savings,
        );
        // Never exceed the offered window, and always make forward progress.
        let block_len = proposed.min(window_len).max(1);
        ranges.push((start, block_len));
        start += block_len;
        // The exact upstream zstd gate uses compressed-size savings. For this corpus
        // parity harness, after the first full block has compressed, savings is
        // sufficient to authorize the same pre-block splitter path.
        if start >= block_max {
            savings = 3;
        }
    }
    ranges
}
10944
10945#[cfg(feature = "bench_internals")]
/// Folds block-delimiter entries (`offset == 0 && match_len == 0`) into the
/// literal length of the next real triple.
///
/// Delimiter literal counts accumulate (saturating) until a triple with a
/// non-zero offset or match length arrives. Literals still pending at the end
/// are emitted as one trailing `(lits, 0, 0)` entry.
fn merge_block_delimiters(sequences: Vec<(usize, usize, usize)>) -> Vec<(usize, usize, usize)> {
    let mut merged = Vec::with_capacity(sequences.len());
    let mut carried = 0usize;
    for triple in sequences {
        match triple {
            (lits, 0, 0) => carried = carried.saturating_add(lits),
            (lits, offset, match_len) => {
                merged.push((lits.saturating_add(carried), offset, match_len));
                carried = 0;
            }
        }
    }
    if carried != 0 {
        merged.push((carried, 0, 0));
    }
    merged
}
10962
/// White-box capture of the level-22 sequence stream for `data` as
/// (literal-length, offset, match-length) triples.
///
/// Block-delimiter pseudo-sequences are merged into the literal run of the
/// triple that follows them, and any entry left with both offset and match
/// length equal to zero is dropped. Pure Rust; the C-conformance comparison
/// that consumes it lives in the `ffi-bench` crate.
#[cfg(feature = "bench_internals")]
pub(crate) fn collect_level22_sequences(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let raw = collect_level22_sequences_with_delimiters(data);
    let mut merged = merge_block_delimiters(raw);
    // Drop the trailing literals-only entry the merge step may leave behind.
    merged.retain(|&(_, offset, match_len)| offset != 0 || match_len != 0);
    merged
}
10975
/// Runs a level-22 `MatchGeneratorDriver` over `data`, block by block as
/// chosen by `level22_block_ranges`, and records every emitted sequence.
///
/// Literals-only sequences are recorded as `(len, 0, 0)`; triples as
/// `(literals.len(), offset, match_len)`.
#[cfg(feature = "bench_internals")]
fn collect_level22_sequences_with_delimiters(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let mut driver = MatchGeneratorDriver::new(super::cost_model::HC_BLOCKSIZE_MAX, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Level(22));

    let mut captured = Vec::new();
    for (block_start, block_len) in level22_block_ranges(data) {
        let block = &data[block_start..block_start + block_len];

        // Copy the block into the driver's buffer and trim it to the block size.
        let mut space = driver.get_next_space();
        let used = block.len();
        space[..used].copy_from_slice(block);
        space.truncate(used);
        driver.commit_space(space);

        driver.start_matching(|seq| {
            captured.push(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        });
    }
    captured
}
11003
/// After a sparse `skip_matching(Some(true))` over a high-entropy block that
/// ends in a known 9-byte tail, a following block that starts with the same
/// tail must open with a zero-literal match at offset `tail.len()` covering at
/// least the whole tail.
#[test]
fn hc_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";
    let mut first = deterministic_high_entropy_bytes(0xD1B5_4A32_9C77_0E19, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    // `first` is not read again, so hand the buffer over instead of cloning it.
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.table.add_data(second, |_| {});

    // Capture only the first emitted sequence; later ones are ignored.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_some() {
            return;
        }
        first_sequence = Some(match seq {
            Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals.len(), offset, match_len),
        });
    });

    let (literals_len, offset, match_len) =
        first_sequence.expect("expected at least one sequence after sparse skip");
    assert_eq!(
        literals_len, 0,
        "first sequence should start at block boundary"
    );
    assert_eq!(
        offset,
        tail.len(),
        "first match should reference previous tail"
    );
    assert!(
        match_len >= tail.len(),
        "tail-aligned cross-block match must be preserved"
    );
}
11049
/// BtUltra2 variant of the sparse-skip tail test: after
/// `skip_matching(Some(true))` over a block ending in a known 9-byte tail, the
/// next block (starting with that tail) must open with a zero-literal match at
/// offset `tail.len()` covering at least the whole tail.
#[test]
fn btultra2_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(
        BTULTRA2_HC_CONFIG_L22,
        super::strategy::StrategyTag::BtUltra2,
        20,
    );

    let tail = b"Bt9kLm2Rp";
    let mut skipped_block = deterministic_high_entropy_bytes(0xA9C3_7F21_D4E8_510B, 4096);
    let tail_at = skipped_block.len() - tail.len();
    skipped_block[tail_at..].copy_from_slice(tail);
    matcher.table.add_data(skipped_block, |_| {});
    matcher.skip_matching(Some(true));

    let mut matched_block = tail.to_vec();
    matched_block.extend_from_slice(b"after-tail-literals");
    matcher.table.add_data(matched_block, |_| {});

    // Record only the first emitted sequence; later ones are ignored.
    let mut opening = None;
    matcher.start_matching(|seq| {
        if opening.is_none() {
            opening = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (literals_len, offset, match_len) =
        opening.expect("expected at least one sequence after sparse BT skip");
    assert_eq!(
        literals_len, 0,
        "BT sparse skip should preserve an immediate boundary match"
    );
    assert_eq!(
        offset,
        tail.len(),
        "first BT match should reference previous tail"
    );
    assert!(
        match_len >= tail.len(),
        "BT sparse skip must seed the dense tail for cross-block matching"
    );
}
11100
/// After a sparse `skip_matching(Some(true))`, a position that lies both on the
/// sparse grid (multiples of `INCOMPRESSIBLE_SKIP_STEP` from the block start)
/// and inside the dense tail must not have a chain entry pointing at itself.
#[test]
fn hc_sparse_skip_matching_does_not_reinsert_sparse_tail_positions() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let first = deterministic_high_entropy_bytes(0xC2B2_AE3D_27D4_EB4F, 4096);
    // Read the length up front so the buffer can be moved instead of cloned.
    let current_len = first.len();
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Absolute span of the block that was just skipped.
    let current_abs_start =
        matcher.table.history_abs_start + matcher.table.window_size - current_len;
    let current_abs_end = current_abs_start + current_len;
    // Start of the dense tail, clamped to the live history and the block.
    let dense_tail = HC_MIN_MATCH_LEN + INCOMPRESSIBLE_SKIP_STEP;
    let tail_start = current_abs_end
        .saturating_sub(dense_tail)
        .max(matcher.table.history_abs_start)
        .max(current_abs_start);

    let overlap_pos = (tail_start..current_abs_end)
        .find(|&pos| (pos - current_abs_start).is_multiple_of(INCOMPRESSIBLE_SKIP_STEP))
        .expect("fixture should contain at least one sparse-grid overlap in dense tail");

    let rel = matcher
        .table
        .relative_position(overlap_pos)
        .expect("overlap position should be representable as relative position");
    let chain_idx = rel as usize & ((1 << matcher.table.chain_log) - 1);
    // A chain slot holding `rel + 1` at `rel`'s own index would be a self-loop.
    assert_ne!(
        matcher.table.chain_table[chain_idx],
        rel + 1,
        "sparse-grid tail positions must not be reinserted (self-loop chain entry)"
    );
}
11133
/// With 16 dead bytes at the front of a 26-byte history, `compact_history`
/// must drop them and reset `history_start` to 0.
#[test]
fn hc_compact_history_drains_when_threshold_crossed() {
    let mut matcher = HcMatchGenerator::new(8);
    matcher.table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.table.history_start = 16;

    matcher.table.compact_history();

    assert_eq!(matcher.table.history_start, 0);
    assert_eq!(matcher.table.history, b"qrstuvwxyz");
}
11143
/// With `position_base` (1) above the inserted position (0),
/// `insert_position_no_rebase` must leave the hash and chain tables untouched.
#[test]
fn hc_insert_position_no_rebase_returns_when_relative_pos_unavailable() {
    let mut matcher = HcMatchGenerator::new(32);
    matcher.table.history = b"abcdefghijklmnop".to_vec();
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 1;
    matcher.table.ensure_tables();

    // Snapshot both tables so any write shows up as a difference.
    let hash_snapshot = matcher.table.hash_table.clone();
    let chain_snapshot = matcher.table.chain_table.clone();

    matcher.table.insert_position_no_rebase(0);

    assert_eq!(matcher.table.hash_table, hash_snapshot);
    assert_eq!(matcher.table.chain_table, chain_snapshot);
}
11159
/// A contiguous `insert_positions(0, 9)` must move `next_to_update3` from 0
/// to 9.
#[test]
fn hc_insert_positions_advances_next_to_update3_for_contiguous_range() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.table.ensure_tables();
    matcher.table.next_to_update3 = 0;

    matcher.table.insert_positions(0, 9);

    let cursor = matcher.table.next_to_update3;
    assert_eq!(
        cursor, 9,
        "contiguous insert_positions should advance hash3 update cursor"
    );
}
11177
/// A strided `insert_positions_with_step(0, 16, 4)` must leave
/// `next_to_update3` at 0.
#[test]
fn hc_insert_positions_with_step_keeps_next_to_update3_cursor_for_sparse_ranges() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.table.ensure_tables();
    matcher.table.next_to_update3 = 0;

    matcher.table.insert_positions_with_step(0, 16, 4);

    let cursor = matcher.table.next_to_update3;
    assert_eq!(
        cursor, 0,
        "sparse insert_positions_with_step must not mark skipped positions as hash3-updated"
    );
}
11195
/// (Compiled out via `#[cfg(any())]`.) Simple-backend variant of the
/// dictionary-budget retirement check: priming with a 24-byte dictionary grows
/// the window by 24, and two committed 8-byte blocks must retire the whole
/// budget and restore the base window.
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_dictionary_budget_shrinks_after_simple_eviction() {
    const BLOCKS: [&[u8; 8]; 2] = [b"AAAAAAAA", b"BBBBBBBB"];

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    // Use a small live window so dictionary-primed slices are evicted
    // quickly and budget retirement can be asserted deterministically.
    driver.simple_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    let window_before_prime = driver.simple_mut().max_window_size;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(
        driver.simple_mut().max_window_size,
        window_before_prime + 24
    );

    for block in BLOCKS {
        let mut space = driver.get_next_space();
        space.clear();
        space.extend_from_slice(block);
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.simple_mut().max_window_size,
        window_before_prime,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11229
/// Dfast variant of the dictionary-budget retirement check: priming with a
/// 24-byte dictionary grows the window by 24, and two committed 8-byte blocks
/// must retire the whole budget and restore the base window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_dfast_eviction() {
    const BLOCKS: [&[u8; 8]; 2] = [b"AAAAAAAA", b"BBBBBBBB"];

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));
    // Use a small live window in this regression so dictionary-primed slices are
    // evicted quickly and budget retirement can be asserted deterministically.
    driver.dfast_matcher_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    let window_before_prime = driver.dfast_matcher().max_window_size;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(
        driver.dfast_matcher().max_window_size,
        window_before_prime + 24
    );

    for block in BLOCKS {
        let mut space = driver.get_next_space();
        space.clear();
        space.extend_from_slice(block);
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.dfast_matcher().max_window_size,
        window_before_prime,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11261
/// After priming the hash-chain backend with an 8-byte dictionary, a first
/// block repeating those bytes must produce a zero-literal match at offset 8
/// of at least `HC_MIN_MATCH_LEN` bytes.
#[test]
fn hc_prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    // Route onto HashChain explicitly — `Better` resolves to the Row
    // backend in production, and this test pins HC dict-prime behaviour.
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // Repeat the dictionary content so the HC matcher can find it.
    // HC_MIN_MATCH_LEN is 5, so an 8-byte match is well above threshold.
    let mut space = driver.get_next_space();
    space.clear();
    space.extend_from_slice(b"abcdefgh");
    driver.commit_space(space);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= HC_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "hash-chain backend should match dictionary-primed history in first full block"
    );
}
11298
/// Hash-chain variant of the dictionary-budget retirement check: priming with
/// a 24-byte dictionary grows the window by 24, and two committed 8-byte
/// blocks must retire the whole budget and restore the base window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_hc_eviction() {
    const BLOCKS: [&[u8; 8]; 2] = [b"AAAAAAAA", b"BBBBBBBB"];

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // Use a small live window so dictionary-primed slices are evicted quickly.
    driver.hc_matcher_mut().table.max_window_size = 8;
    driver.reported_window_size = 8;

    let window_before_prime = driver.hc_matcher().table.max_window_size;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(
        driver.hc_matcher().table.max_window_size,
        window_before_prime + 24
    );

    for block in BLOCKS {
        let mut space = driver.get_next_space();
        space.clear();
        space.extend_from_slice(block);
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.hc_matcher().table.max_window_size,
        window_before_prime,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11329
/// With a 1 MiB live window, committing one 8-byte block on the hash-chain
/// backend must leave `dictionary_retained_budget` exactly as priming set it.
#[test]
fn hc_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression: after the window<->history dedup, MatchTable::add_data
    // invokes its reuse_space callback for the *input* buffer (recycle),
    // not for evicted chunks. The HC arm of commit_space must therefore
    // derive eviction bytes from the window_size delta — counting the
    // callback argument as evicted would charge the whole committed block
    // as "evicted" and prematurely retire dictionary budget even when the
    // window is nowhere near full.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // A large live window so a small committed block evicts nothing.
    driver.hc_matcher_mut().table.max_window_size = 1 << 20;
    driver.reported_window_size = 1 << 20;

    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let primed_budget = driver.dictionary_retained_budget;
    assert!(
        primed_budget > 0,
        "priming must retain a non-zero dictionary budget"
    );

    let mut space = driver.get_next_space();
    space.clear();
    space.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    assert_eq!(
        driver.dictionary_retained_budget, primed_budget,
        "a commit that evicts nothing must retire no dictionary budget"
    );
}
11362
/// With a 1 MiB live window, committing one 8-byte block on the Row backend
/// (level 5) must leave `dictionary_retained_budget` exactly as priming set it.
#[test]
fn row_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression for the Row arm of commit_space after the window ->
    // chunk_lens migration: RowMatchGenerator::add_data now invokes its
    // reuse_space callback for the *input* buffer (per-commit recycle),
    // not for evicted chunks. The Row arm must derive eviction bytes from
    // the window_size delta like the Dfast / HashChain arms — counting the
    // callback argument as evicted charges the whole committed block as
    // "evicted" and prematurely retires dictionary budget even when the
    // window is nowhere near full.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    // Guard: this test is only meaningful if level 5 selects the Row backend.
    assert!(matches!(driver.storage, MatcherStorage::Row(_)));
    // A large live window so a small committed block evicts nothing.
    driver.row_matcher_mut().max_window_size = 1 << 20;
    driver.reported_window_size = 1 << 20;

    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let primed_budget = driver.dictionary_retained_budget;
    assert!(
        primed_budget > 0,
        "priming must retain a non-zero dictionary budget"
    );

    let mut space = driver.get_next_space();
    space.clear();
    space.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    assert_eq!(
        driver.dictionary_retained_budget, primed_budget,
        "a Row commit that evicts nothing must retire no dictionary budget"
    );
}
11397
#[test]
fn hc_rebases_positions_after_u32_boundary() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;

    // An absolute origin just past the u32 range. If `usize` cannot hold
    // it (the conversion fails), the scenario cannot be built and the test
    // returns early.
    let beyond_u32: usize = match (u64::from(u32::MAX) + 64).try_into() {
        Ok(origin) => origin,
        Err(_) => return,
    };

    // Simulate a long-running stream where absolute history positions crossed
    // the u32 range. Before #51 this disabled HC inserts entirely.
    matcher.table.history_abs_start = beyond_u32;
    matcher.skip_matching(None);

    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should anchor to the oldest live absolute position"
    );

    let hash_table_is_blank = matcher
        .table
        .hash_table
        .iter()
        .all(|slot| *slot == HC_EMPTY);
    assert!(
        !hash_table_is_blank,
        "HC hash table should still be populated after crossing u32 boundary"
    );

    // Population alone is not enough: a chain lookup at a rebased position
    // must still surface at least one real candidate.
    let probe_pos = matcher.table.history_abs_start + 10;
    let candidates = matcher.hc.chain_candidates(&matcher.table, probe_pos);
    let every_slot_unused = candidates.iter().all(|slot| *slot == usize::MAX);
    assert!(
        !every_slot_unused,
        "chain_candidates should return valid matches after rebase"
    );
}
11434
// Restricted to 64-bit targets: a >4 GiB absolute cursor cannot be fabricated
// when usize == u32, and placing `history_abs_start` near `u32::MAX` there
// overflows `usize` inside the `check_stream_abs_headroom` guard before the
// rebase path runs. This plays the same role as the `try_into()` early return
// in `hc_rebases_positions_after_u32_boundary`.
#[cfg(target_pointer_width = "64")]
#[test]
fn row_rebases_positions_after_u32_boundary() {
    // Row keeps absolute match positions as u32. A long stream pushes the
    // cumulative absolute cursor past the u32 range even though the live
    // window stays bounded, so `add_data` has to move the coordinate origin
    // down to the oldest live byte. Before the rebase existed this tripped
    // the `< u32::MAX` assertion and dropped valid long Row-backed frames.
    let mut row = RowMatchGenerator::new(64);
    row.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});

    // Pretend ~4 GiB of stream already went by: park the live bytes just
    // below the u32 absolute ceiling.
    let near_ceiling = (u32::MAX as usize) - 16;
    row.history_abs_start = near_ceiling;

    // This commit would carry a u32 position over the ceiling; it must
    // trigger a rebase instead of a panic.
    row.add_data(b"fghij".to_vec(), |_| {});

    let rebased_origin = row.history_abs_start;
    assert!(
        rebased_origin < near_ceiling,
        "add_data must rebase the absolute origin down when the cursor nears \
         u32::MAX (got {})",
        rebased_origin
    );

    let live_window_end = row.history_abs_start + row.window_size;
    assert!(
        live_window_end < u32::MAX as usize,
        "after rebase the live window must fit below the u32 position ceiling"
    );
}
11472
#[test]
fn hc_rebase_rebuilds_only_inserted_prefix() {
    // Both the matcher under test and the hand-built reference start from
    // the same 15-byte history with allocated tables.
    let seeded = || {
        let mut generator = HcMatchGenerator::new(64);
        generator.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
        generator.table.ensure_tables();
        generator
    };

    let mut matcher = seeded();
    matcher.table.position_base = 0;
    // Origin just past the u32 range; return early if `usize` cannot hold it.
    let history_abs_start: usize = match (u64::from(u32::MAX) + 64).try_into() {
        Ok(origin) => origin,
        Err(_) => return,
    };
    matcher.table.history_abs_start = history_abs_start;
    let abs_pos = matcher.table.history_abs_start + 6;

    // Reference state: tables wiped, base anchored at the history start,
    // and only the positions strictly before `abs_pos` inserted.
    let mut expected = seeded();
    expected.table.history_abs_start = history_abs_start;
    expected.table.position_base = expected.table.history_abs_start;
    expected.table.hash_table.fill(HC_EMPTY);
    expected.table.chain_table.fill(HC_EMPTY);
    (expected.table.history_abs_start..abs_pos).for_each(|pos| {
        expected.table.insert_position_no_rebase(pos);
    });

    matcher.table.maybe_rebase_positions(abs_pos);

    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should still anchor to the oldest live absolute position"
    );
    assert_eq!(
        matcher.table.hash_table, expected.table.hash_table,
        "rebase must rebuild only positions already inserted before abs_pos"
    );
    assert_eq!(
        matcher.table.chain_table, expected.table.chain_table,
        "future positions must not be pre-seeded into HC chains during rebase"
    );
}
11512
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn suffix_store_with_single_slot_does_not_panic_on_keying() {
    // A store with capacity one must still accept and return a key.
    let key = b"abcde";
    let mut store = SuffixStore::with_capacity(1);
    store.insert(key, 0);

    assert!(store.contains_key(key));
    assert_eq!(store.get(key), Some(0));
}
11521
#[cfg(any())]
// disabled: hash_fill_step is a legacy MatchGenerator field; FastKernelMatcher walks stride=1 today
#[test]
fn fastest_reset_uses_interleaved_hash_fill_step() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Uncompressed fills every position; Fastest uses the interleaved stride.
    driver.reset(CompressionLevel::Uncompressed);
    let uncompressed_step = driver.simple().hash_fill_step;
    assert_eq!(uncompressed_step, 1);

    driver.reset(CompressionLevel::Fastest);
    let fastest_step = driver.simple().hash_fill_step;
    assert_eq!(fastest_step, FAST_HASH_FILL_STEP);

    // Better switches to the HashChain backend with lazy2: check the backend
    // tag, the reported window and the configured lazy depth.
    driver.reset(CompressionLevel::Better);
    let backend = driver.active_backend();
    assert_eq!(backend, super::strategy::BackendTag::HashChain);
    assert_eq!(driver.window_size(), 1u64 << 23);
    assert_eq!(driver.hc_matcher().hc.lazy_depth, 2);
}
11544
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_updates_offset_history_after_emitting_match() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdeabcdeabcde".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // The period-5 input is expected to come out as five literals followed
    // by a ten-byte match at offset 5.
    let emitted = matcher.next_sequence(|seq| {
        let expected = Sequence::Triple {
            literals: b"abcde",
            offset: 5,
            match_len: 10,
        };
        assert_eq!(seq, expected);
    });
    assert!(emitted);

    // The emitted offset must now sit at the front of the history.
    assert_eq!(matcher.offset_hist, [5, 1, 4]);
}
11567
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep1_before_hash_lookup() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // Position the cursor on the second copy with no pending literals and
    // an offset history whose second slot is the matching offset 10.
    matcher.offset_hist = [99, 10, 4];
    matcher.last_idx_in_sequence = 10;
    matcher.suffix_idx = 10;

    let newest = matcher.window.last().unwrap();
    assert_eq!(
        matcher.repcode_candidate(&newest.data[10..], 0),
        Some((10, 10))
    );
}
11585
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_repcode_can_target_previous_window_entry() {
    let mut matcher = MatchGenerator::new(64);

    // Two identical ten-byte blocks; the first is skipped, not matched.
    let first_block = b"abcdefghij".to_vec();
    matcher.add_data(first_block, SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    let second_block = b"abcdefghij".to_vec();
    matcher.add_data(second_block, SuffixStore::with_capacity(64), |_, _| {});

    matcher.offset_hist = [99, 10, 4];

    // Offset 10 from the start of the newest block points into the
    // previous window entry.
    let newest = matcher.window.last().unwrap();
    assert_eq!(matcher.repcode_candidate(&newest.data, 0), Some((10, 10)));
}
11607
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep2() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // rep1=4 does not match at idx 10, rep2=10 does.
    matcher.offset_hist = [99, 4, 10];
    matcher.last_idx_in_sequence = 10;
    matcher.suffix_idx = 10;

    let newest = matcher.window.last().unwrap();
    assert_eq!(
        matcher.repcode_candidate(&newest.data[10..], 0),
        Some((10, 10))
    );
}
11625
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep0_minus1() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // rep1=4 and rep2=99 do not match; rep0-1 == 10 does.
    matcher.offset_hist = [11, 4, 99];
    matcher.last_idx_in_sequence = 10;
    matcher.suffix_idx = 10;

    let newest = matcher.window.last().unwrap();
    assert_eq!(
        matcher.repcode_candidate(&newest.data[10..], 0),
        Some((10, 10))
    );
}
11643
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_repcode_rejects_offsets_beyond_searchable_prefix() {
    let mut matcher = MatchGenerator::new(64);

    // Two distinct ten-byte blocks, the first one skipped.
    let first_block = b"abcdefghij".to_vec();
    matcher.add_data(first_block, SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    let second_block = b"klmnopqrst".to_vec();
    matcher.add_data(second_block, SuffixStore::with_capacity(64), |_, _| {});
    matcher.suffix_idx = 3;

    // With the cursor at index 3 of the newest block, offset 14 must be
    // rejected.
    let newest = matcher.window.last().unwrap();
    assert_eq!(matcher.offset_match_len(14, &newest.data[3..]), None);
}
11664
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_skip_matching_seeds_every_position_even_with_fast_step() {
    let mut matcher = MatchGenerator::new(64);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;

    // Skip over a 16-byte block, then feed five bytes that also occur at
    // index 1 of that block.
    let skipped_block = b"abcdefghijklmnop".to_vec();
    matcher.add_data(skipped_block, SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    let probe = b"bcdef".to_vec();
    matcher.add_data(probe, SuffixStore::with_capacity(64), |_, _| {});

    // The probe must match immediately, with no leading literals.
    let emitted = matcher.next_sequence(|seq| {
        let expected = Sequence::Triple {
            literals: b"",
            offset: 15,
            match_len: 5,
        };
        assert_eq!(seq, expected);
    });
    assert!(emitted);

    // Nothing is left to emit after that single match.
    let emitted_again = matcher.next_sequence(|_| {});
    assert!(!emitted_again);
}
11690
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    let first = b"abcdefghijklmnopqrstuvwxyz012345".to_vec();
    // One probe from near the start of the block, one covering its final
    // MIN_MATCH_LEN bytes.
    let sparse_probe = first[3..3 + MIN_MATCH_LEN].to_vec();
    let tail_probe = first[first.len() - MIN_MATCH_LEN..].to_vec();

    let mut matcher = MatchGenerator::new(128);
    matcher.add_data(first, SuffixStore::with_capacity(256), |_, _| {});
    matcher.skip_matching_with_hint(Some(true));

    // Observable behavior check: sparse-prefix probe should not immediately match.
    matcher.add_data(sparse_probe, SuffixStore::with_capacity(256), |_, _| {});
    let mut sparse_first_is_literals: Option<bool> = None;
    let sparse_emitted = matcher.next_sequence(|seq| {
        // Only the first sequence seen is recorded.
        sparse_first_is_literals.get_or_insert(matches!(seq, Sequence::Literals { .. }));
    });
    assert!(sparse_emitted);
    assert!(
        sparse_first_is_literals.unwrap_or(false),
        "sparse-start probe should not produce an immediate match"
    );

    // Dense tail remains indexed for cross-block boundary matching.
    let mut matcher = MatchGenerator::new(128);
    matcher.add_data(
        b"abcdefghijklmnopqrstuvwxyz012345".to_vec(),
        SuffixStore::with_capacity(256),
        |_, _| {},
    );
    matcher.skip_matching_with_hint(Some(true));
    matcher.add_data(tail_probe, SuffixStore::with_capacity(256), |_, _| {});
    let mut tail_first_is_immediate_match: Option<bool> = None;
    let tail_emitted = matcher.next_sequence(|seq| {
        // Only the first sequence seen is recorded.
        tail_first_is_immediate_match.get_or_insert(matches!(
            seq,
            Sequence::Triple { literals, .. } if literals.is_empty()
        ));
    });
    assert!(tail_emitted);
    assert!(
        tail_first_is_immediate_match.unwrap_or(false),
        "dense tail probe should match immediately at block start"
    );
}
11737
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_backfills_last_searchable_anchor() {
    let mut matcher = MatchGenerator::new(64);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;
    let input = b"01234abcde".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    matcher.add_suffixes_till(10, FAST_HASH_FILL_STEP);

    // The key made of bytes 5..10 must be registered at index 5 even though
    // the fill ran with the fast step.
    let newest = matcher.window.last().unwrap();
    assert_eq!(newest.suffixes.get(&newest.data[5..10]), Some(5));
}
11754
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_skips_when_idx_below_min_match_len() {
    let mut matcher = MatchGenerator::new(128);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;
    let input = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(1 << 16), |_, _| {});

    // Filling up to one short of MIN_MATCH_LEN must register nothing.
    matcher.add_suffixes_till(MIN_MATCH_LEN - 1, FAST_HASH_FILL_STEP);

    let newest = matcher.window.last().unwrap();
    assert_eq!(newest.suffixes.get(&newest.data[..MIN_MATCH_LEN]), None);
}
11772
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_fast_step_registers_interleaved_positions() {
    let mut matcher = MatchGenerator::new(128);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;
    let input = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(1 << 16), |_, _| {});

    matcher.add_suffixes_till(17, FAST_HASH_FILL_STEP);

    // Every third position from 0 through 12 must have been registered.
    let newest = matcher.window.last().unwrap();
    for pos in (0usize..=12).step_by(3) {
        let registered = newest.suffixes.get(&newest.data[pos..pos + MIN_MATCH_LEN]);
        assert_eq!(
            registered,
            Some(pos),
            "expected interleaved suffix registration at pos {pos}"
        );
    }
}
11796
#[test]
fn dfast_skip_matching_handles_window_eviction() {
    let mut matcher = DfastMatchGenerator::new(16);

    // Three 6-byte blocks against a 16-byte window: the first two are
    // skipped, the third (a repeat of the second) is actually matched.
    matcher.add_data(alloc::vec![1, 2, 3, 4, 5, 6], |_| {});
    matcher.skip_matching(None);
    matcher.add_data(alloc::vec![7, 8, 9, 10, 11, 12], |_| {});
    matcher.skip_matching(None);
    matcher.add_data(alloc::vec![7, 8, 9, 10, 11, 12], |_| {});

    // Replay the emitted sequences on top of the second block, the way a
    // decoder would, to rebuild blocks two and three.
    let mut reconstructed = alloc::vec![7, 8, 9, 10, 11, 12];
    matcher.start_matching(|seq| {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        reconstructed.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_reference {
            // Copy byte by byte so a match may overlap its own output.
            let start = reconstructed.len() - offset;
            for src in start..start + match_len {
                let byte = reconstructed[src];
                reconstructed.push(byte);
            }
        }
    });

    assert_eq!(reconstructed, [7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12]);
}
11826
#[test]
fn dfast_add_data_callback_reports_evicted_len_not_capacity() {
    // Builds an 8-byte payload inside a buffer whose capacity (64) is much
    // larger than its length, so length and capacity cannot be confused.
    let oversized = |payload: &[u8]| {
        let mut buffer = Vec::with_capacity(64);
        buffer.extend_from_slice(payload);
        buffer
    };

    let mut matcher = DfastMatchGenerator::new(8);
    matcher.add_data(oversized(b"abcdefgh"), |_| {});

    // NOTE(review): comments on sibling tests say Dfast's `add_data` hands
    // the *input* Vec to this callback. Input and evicted lengths are both
    // 8 here, so this test cannot tell them apart — confirm intent.
    let mut observed_evicted_len = None;
    matcher.add_data(oversized(b"ijklmnop"), |data| {
        observed_evicted_len = Some(data.len());
    });

    assert_eq!(
        observed_evicted_len,
        Some(8),
        "eviction callback must report evicted byte length, not backing capacity"
    );
}
11849
/// Driver-path regression for the `commit_space` Dfast-branch eviction
/// accounting bug (CodeRabbit Critical on PR #146).
///
/// Old code counted the INPUT buffer length as `evicted_bytes` because
/// Dfast's `add_data` callback receives the input `Vec<u8>` for pool
/// recycling (Dfast stores bytes in `history`, not per-block Vecs). On the
/// saturated-window 1:1 path the two coincide, so the previous test fixture
/// passed by accident. The fix derives eviction from the `window_size`
/// delta + input length: `evicted = pre + space_len - post`.
///
/// Exercises `MatchGeneratorDriver::commit_space` directly (not just
/// `DfastMatchGenerator::add_data`) so the assertion catches a future
/// regression that swaps the Dfast branch in `commit_space` back to
/// `evicted_bytes += data.len()` — the older draft of this regression
/// hand-recomputed the formula on the matcher and would pass either way.
///
/// Fixture: `max_window_size = 10`, commit sequence `[4, 4, 5]`. The
/// divergent case where the popped block (4 bytes) and the new input
/// (5 bytes) have different sizes:
///
///   * after commit `"abcd"` (4 B): window_blocks=[4], ws=4
///   * after commit `"efgh"` (4 B): window_blocks=[4,4], ws=8
///   * commit `"ijklm"` (5 B): 8+5>10 → pop front [4] (evict=4),
///     push 5 → window_blocks=[4,5], ws=9
///
/// `commit_space` then calls `retire_dictionary_budget(evicted)`. With
/// the fix `evicted=4`; with the bug it would be `evicted=5`. The
/// downstream `trim_after_budget_retire` cascade (which fires whenever
/// `retire_dictionary_budget` returns true) drives the budget further
/// down by trimming the now-oversize window; the final
/// `dictionary_retained_budget` differs between the two paths because
/// the cascade starting state differs (max_window_size after first
/// retire is `10 - evicted`). The observable asserted below is therefore
/// the post-cascade budget (87), not the value right after the first
/// retire (96).
///
/// Tracing the fix path end-to-end with starting budget = 100:
///   1st commit: evicted=0, no retire.
///   2nd commit: evicted=0, no retire.
///   3rd commit: evicted=4. retire(4) → budget=96, max_window=6.
///     trim_after_budget_retire:
///       iter1: ws=9 > max=6, pop [4] → ws=5, evicted=4.
///              retire(4) → budget=92, max_window=2.
///       iter2: ws=5 > max=2, pop [5] → ws=0, evicted=5.
///              retire(5) → budget=87, max_window=0.
///       iter3: ws=0, no trim, retire(0) → false, exit.
///   Final budget = 87. Final max_window_size = 0.
///
/// In the buggy path the 3rd commit would compute `evicted=5`, retire
/// would reclaim 5 instead of 4, shrinking max_window_size to 5
/// instead of 6 — and then the cascade arithmetic produces a
/// different final budget (and on the 2nd commit the cascade would
/// already have shrunk max_window_size to 0, causing the 3rd commit
/// to panic on `data.len() <= max_window_size`). Either way the
/// regression surfaces as a test failure.
#[test]
fn dfast_commit_space_eviction_uses_window_size_delta() {
    use crate::encoding::CompressionLevel;

    let mut driver = MatchGeneratorDriver::new(10, 1);
    driver.reset(CompressionLevel::Level(3));
    assert!(matches!(driver.storage, MatcherStorage::Dfast(_)));

    // Override the level-derived window with a tiny one so the
    // 4 + 4 + 5 = 13 commit sequence below actually crosses the
    // boundary. A 16 KiB+ default window would never evict on this
    // little data and the bug would stay invisible.
    driver.dfast_matcher_mut().max_window_size = 10;
    driver.dictionary_retained_budget = 100;

    let mut space1 = Vec::with_capacity(64);
    space1.extend_from_slice(b"abcd");
    driver.commit_space(space1);
    assert_eq!(
        driver.dictionary_retained_budget, 100,
        "1st commit fills window 0 → 4, no eviction, no retire"
    );

    let mut space2 = Vec::with_capacity(64);
    space2.extend_from_slice(b"efgh");
    driver.commit_space(space2);
    assert_eq!(
        driver.dictionary_retained_budget, 100,
        "2nd commit fills window 4 → 8, no eviction, no retire"
    );

    let mut space3 = Vec::with_capacity(64);
    space3.extend_from_slice(b"ijklm");
    driver.commit_space(space3);
    assert_eq!(
        driver.dictionary_retained_budget, 87,
        "3rd commit + trim_after_budget_retire cascade. With the fix \
         (evicted=4 from window_size delta) the cascade reclaims 100 \
         → 96 → 92 → 87. With the bug (evicted=5 from data.len()) the \
         3rd commit would panic on `data.len() <= max_window_size` \
         after the 2nd commit's cascade had already shrunk \
         max_window_size to 0."
    );
    assert_eq!(
        driver.dfast_matcher_mut().max_window_size,
        0,
        "cascade drains max_window_size to 0 once budget reclaim \
         exceeds the initial window size"
    );
}
11963
#[test]
fn dfast_trim_to_window_evicts_oldest_block_by_length() {
    // Since the history-only storage refactor (#111 Phase 7c step 3) Dfast
    // keeps no input `Vec<u8>`s: the contiguous `history` buffer is the
    // only byte store and `add_data` gives the input Vec back to the
    // caller's pool right away. `trim_to_window` therefore has no Vec to
    // pass to a closure; eviction shows up as `window_size` dropping by the
    // per-block length recorded in `window_blocks`.
    let mut matcher = DfastMatchGenerator::new(16);

    // Two 8-byte blocks, each in a buffer with spare capacity.
    for chunk in [b"abcdefgh", b"ijklmnop"] {
        let mut buffer = Vec::with_capacity(64);
        buffer.extend_from_slice(chunk);
        matcher.add_data(buffer, |_| {});
    }

    assert_eq!(matcher.window_size, 16);
    assert_eq!(matcher.window_blocks.len(), 2);

    // Halve the allowed window and trim down to it.
    matcher.max_window_size = 8;
    matcher.trim_to_window();

    // There is no callback to assert on: Dfast's `trim_to_window` takes
    // none, unlike the HC/Row variants that accept `impl FnMut(Vec<u8>)`.
    // That signature is what pins the contract — nobody can "start
    // invoking the callback" without a compile-time signature break that
    // the dispatcher and this test would force them to handle.
    assert_eq!(
        matcher.window_size, 8,
        "exactly one 8-byte block must remain"
    );
    assert_eq!(matcher.window_blocks.len(), 1);
    assert_eq!(matcher.history_abs_start, 8);
}
12005
#[test]
fn dfast_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // Block 1 must come out as literals only.
    matcher.add_data(b"012345bcdea".to_vec(), |_| {});
    let mut history = Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            history.extend_from_slice(literals);
        } else {
            unreachable!("first block should not match history");
        }
    });
    assert_eq!(history, b"012345bcdea");

    // Block 2 must be covered by exactly one match that starts at the block
    // boundary and points back into the tail of block 1.
    matcher.add_data(b"bcdeabcdeab".to_vec(), |_| {});
    let mut boundary_match_seen = false;
    matcher.start_matching(|seq| {
        assert!(!boundary_match_seen, "expected a single cross-block match");
        boundary_match_seen = true;
        match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert_eq!(literals, b"");
                assert_eq!(offset, 5);
                assert_eq!(match_len, 11);
                // Replay the match byte by byte; it overlaps its own output.
                let start = history.len() - offset;
                for src in start..start + match_len {
                    let byte = history[src];
                    history.push(byte);
                }
            }
            Sequence::Literals { .. } => {
                panic!("expected tail-anchored cross-block match before any literals")
            }
        }
    });

    assert!(
        boundary_match_seen,
        "expected tail-anchored cross-block match"
    );
    assert_eq!(history, b"012345bcdeabcdeabcdeab");
}
12050
/// Regression for #49: pins `MatchTable::backfill_boundary_positions` on
/// the [`HcMatchGenerator`] lazy path.
///
/// `backfill_boundary_positions` seeds ONLY the final `< 4` bytes of the
/// previous slice, i.e. positions in
/// `[current_abs_start - 3, current_abs_start)`. Those are the bytes
/// `insert_position` could not hash earlier, because hashing needs 4 bytes
/// of lookahead. The existing 8 MiB window roundtrip test covers
/// cross-slice behaviour end-to-end but does not isolate the backfill of
/// those last 1-3 unhashable bytes.
///
/// The fixture forces the cross-block match's candidate position into
/// `[block_1_end - 3, block_1_end)`:
///
/// - Block 1 is `b"PQRSTBCD"` (8 bytes). Its `start_matching` pass hashes
///   positions 0..=4, each of which has 4 bytes of forward context;
///   positions 5/6/7 form the unhashable tail.
/// - Block 2 is `b"BCDBCDBCDB"` (10 bytes). At absolute position 8 (the
///   start of block 2) the 4-byte window reads `b"BCDB"`. The ONLY earlier
///   insertion of `b"BCDB"` into the hash + chain tables is position 5,
///   made by `backfill_boundary_positions` on entry to the next slice
///   (`data[5..9] = b"BCD" + block_2[0] = b"BCDB"`).
///
/// Should `backfill_boundary_positions` regress, position 5 is never
/// hashed, the lookup at position 8 misses, the lazy parser emits a
/// leading literals run, and `offset == 3, match_len >= 4` stops holding.
#[test]
fn hashchain_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    matcher.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    // Block 1: no internal repeats, so only literals may be emitted.
    matcher.table.add_data(b"PQRSTBCD".to_vec(), |_| {});
    let mut history = alloc::vec::Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            history.extend_from_slice(literals);
        } else {
            unreachable!("first block has no internal repeats");
        }
    });
    assert_eq!(history, b"PQRSTBCD");

    // Block 2: record (offset, match_len) of the first sequence only.
    matcher.table.add_data(b"BCDBCDBCDB".to_vec(), |_| {});
    let mut first_match: Option<(usize, usize)> = None;
    matcher.start_matching(|seq| {
        if first_match.is_some() {
            return;
        }
        match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert_eq!(literals, b"", "no leading literals on the boundary match");
                first_match = Some((offset, match_len));
            }
            Sequence::Literals { .. } => {
                panic!(
                    "expected tail-anchored cross-block match before any literals — \
                     backfill_boundary_positions did not seed positions 5/6/7"
                )
            }
        }
    });

    let (offset, match_len) = first_match.expect(
        "expected tail-anchored cross-block match emitted from backfill_boundary_positions",
    );
    assert!(
        (1..=3).contains(&offset),
        "boundary match offset {offset} must point into the unhashable tail \
         (positions 5/6/7 of an 8-byte block 1) so the test specifically \
         locks down backfill_boundary_positions",
    );
    assert_eq!(
        offset, 3,
        "candidate position must land at 5 (= block_1_len - 3) so the 4-byte \
         window `data[5..9] = b\"BCDB\"` matches block 2's first hash lookup",
    );
    assert!(
        match_len >= HC_MIN_MATCH_LEN,
        "match_len {match_len} must clear the HC min-match floor",
    );
}
12136
/// A block skipped with the `Some(false)` hint (the dense variant, going by
/// this test's name) must still leave its tail findable: a following block
/// that begins with the same 9-byte tail has to open with a match at offset
/// `tail.len()` and no leading literals.
#[test]
fn dfast_dense_skip_matching_backfills_previous_tail_for_next_block() {
    let tail = b"Qz9kLm2Rp";
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // First block: filler followed by the tail. It is handed over by value —
    // nothing reads it afterwards, so no clone is needed.
    let mut first = b"0123456789abcdef".to_vec();
    first.extend_from_slice(tail);
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(false));

    // Second block: the same tail again, then bytes that do not repeat.
    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // Capture only the first emitted sequence as
    // (literal_len, offset, match_len); a literals-only sequence is recorded
    // with offset and match_len of 0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected dense skip to preserve cross-boundary tail match"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12180
/// A 4 KiB high-entropy block skipped with the `Some(true)` hint (the sparse
/// variant, going by this test's name) must still have its last bytes seeded:
/// a following block that begins with the same 9-byte tail has to open with a
/// match at offset `tail.len()` and no leading literals.
#[test]
fn dfast_sparse_skip_matching_preserves_tail_cross_block_match() {
    let tail = b"Qz9kLm2Rp";
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // Overwrite the end of the pseudo-random block with the known tail, then
    // hand the block over by value — nothing reads it afterwards, so no clone
    // is needed.
    let mut first = deterministic_high_entropy_bytes(0x9E37_79B9_7F4A_7C15, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    matcher.add_data(first, |_| {});

    matcher.skip_matching(Some(true));

    // Second block: the same tail again, then bytes that do not repeat.
    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // Capture only the first emitted sequence as
    // (literal_len, offset, match_len); a literals-only sequence is recorded
    // with offset and match_len of 0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected match against densely seeded tail"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12226
/// After a dense skip of a 4 KiB block, appending a single byte and skipping
/// densely again must seed the long-hash entry for the one start in the
/// previous block that only now has a full 8-byte window.
#[test]
fn dfast_skip_matching_dense_backfills_newly_hashable_long_tail_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let first = deterministic_high_entropy_bytes(0x7A64_0315_D4E1_91C3, 4096);
    let first_len = first.len();
    matcher.add_data(first, |_| {});
    matcher.skip_matching_dense();

    // Appending one byte makes exactly ONE start newly eligible for 8-byte
    // long-hash insertion: `first_len - 7`, the earliest of the previous
    // block's last 7 starts, whose 8-byte window now ends on the appended
    // byte. The other six tail starts are still short of 8 bytes.
    matcher.add_data(alloc::vec![0xAB], |_| {});
    matcher.skip_matching_dense();

    let target_abs_pos = first_len - 7;
    let target_rel = target_abs_pos - matcher.history_abs_start;
    let live = matcher.live_history();
    assert!(
        target_rel + 8 <= live.len(),
        "fixture must make the boundary start long-hashable"
    );
    let long_hash = matcher.long_hash_index(&live[target_rel..]);
    let target_slot = matcher.pack_slot(target_abs_pos);
    // Single-slot tables (upstream zstd parity): the bucket holds at most one
    // u32; the assertion below is a direct equality (no `.contains`).
    assert_ne!(
        target_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );
    assert_eq!(
        matcher.long_hash[long_hash], target_slot,
        "dense skip must seed long-hash entry for newly hashable boundary start"
    );
}
12260
/// `seed_remaining_hashable_starts`, called with a seed start
/// `DFAST_MIN_MATCH_LEN` bytes before the end of a 64-byte block, must leave
/// the short-hash table pointing at the block's last 5-byte-hashable start.
#[test]
fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() {
    // Width of the short-hash window the assertions below reason about
    // ("5-byte-hashable" in the failure messages).
    let short_window = 5;

    let mut matcher = DfastMatchGenerator::new(1 << 20);
    let block = deterministic_high_entropy_bytes(0x13F0_9A6D_55CE_7B21, 64);
    matcher.add_data(block, |_| {});
    matcher.ensure_hash_tables();

    // A missing block is a broken fixture: fail here with a clear message
    // instead of defaulting to 0 and underflowing in the arithmetic below.
    let current_len = matcher
        .window_blocks
        .back()
        .copied()
        .expect("add_data must have recorded the committed block");
    let current_abs_start = matcher.history_abs_start + matcher.window_size - current_len;
    let seed_start = current_len - DFAST_MIN_MATCH_LEN;
    matcher.seed_remaining_hashable_starts(current_abs_start, current_len, seed_start);

    let target_abs_pos = current_abs_start + current_len - short_window;
    let target_rel = target_abs_pos - matcher.history_abs_start;
    let live = matcher.live_history();
    assert!(
        target_rel + short_window <= live.len(),
        "fixture must leave the last short-hash start valid"
    );
    let short_hash = matcher.short_hash_index(&live[target_rel..]);
    let target_slot = matcher.pack_slot(target_abs_pos);
    assert_ne!(
        target_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );
    assert_eq!(
        matcher.short_hash[short_hash], target_slot,
        "tail seeding must include the last 5-byte-hashable start"
    );
}
12291
/// Same contract as the sibling seeding test, but with the third argument of
/// `seed_remaining_hashable_starts` equal to the block length (the
/// "pos at block end" case named by this test): the last 5-byte-hashable
/// start must still end up in the short-hash table.
#[test]
fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() {
    // Width of the short-hash window the assertions below reason about
    // ("5-byte-hashable" in the failure messages).
    let short_window = 5;

    let mut matcher = DfastMatchGenerator::new(1 << 20);
    let block = deterministic_high_entropy_bytes(0x7BB2_DA91_441E_C0EF, 64);
    matcher.add_data(block, |_| {});
    matcher.ensure_hash_tables();

    // A missing block is a broken fixture: fail here with a clear message
    // instead of defaulting to 0 and underflowing in the arithmetic below.
    let current_len = matcher
        .window_blocks
        .back()
        .copied()
        .expect("add_data must have recorded the committed block");
    let current_abs_start = matcher.history_abs_start + matcher.window_size - current_len;
    matcher.seed_remaining_hashable_starts(current_abs_start, current_len, current_len);

    let target_abs_pos = current_abs_start + current_len - short_window;
    let target_rel = target_abs_pos - matcher.history_abs_start;
    let live = matcher.live_history();
    assert!(
        target_rel + short_window <= live.len(),
        "fixture must leave the last short-hash start valid"
    );
    let short_hash = matcher.short_hash_index(&live[target_rel..]);
    let target_slot = matcher.pack_slot(target_abs_pos);
    assert_ne!(
        target_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );
    assert_eq!(
        matcher.short_hash[short_hash], target_slot,
        "tail seeding must still include the last 5-byte-hashable start when pos is at block end"
    );
}
12321
/// Rebase contract of `ensure_room_for`: asking for room at an absolute
/// position whose relative offset would exceed
/// `u32::MAX - DFAST_REBASE_GUARD_BAND` must run `reduce()`. Entries that
/// were inserted at much smaller absolute positions fall below the reducer
/// and are reset to `DFAST_EMPTY_SLOT`, while a fresh `pack_slot` at the
/// trigger position must yield a non-sentinel value that maps back to the
/// same absolute position. Mirrors `LdmHashTable::ensure_room_for_*` from
/// PR #139.
///
/// The test is not gated on pointer width: the trigger position
/// `u32::MAX - DFAST_REBASE_GUARD_BAND + 1 = 0xC0000000` fits in `usize`
/// even on i686 (`usize::MAX = u32::MAX`), so the packed-slot boundary and
/// the u32 ↔ usize round-trip are covered on every target we ship.
#[test]
fn dfast_ensure_room_for_rebases_above_guard_band() {
    let guard_band = DFAST_REBASE_GUARD_BAND as usize;

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    matcher.set_hash_bits(10, 10);
    matcher.ensure_hash_tables();

    // Plant one early entry in BOTH tables. `ensure_room_for` / `reduce` is a
    // contract shared by `short_hash` and `long_hash`; seeding only one would
    // let a regression that clears just that table slip through. Writing the
    // packed slot straight into bucket 0 keeps the test about rebase
    // mechanics rather than the full `insert_position` flow and its
    // history/window setup.
    let early_abs = 1024usize;
    let early_slot = matcher.pack_slot(early_abs);
    assert_ne!(early_slot, DFAST_EMPTY_SLOT);
    matcher.short_hash[0] = early_slot;
    matcher.long_hash[0] = early_slot;

    // With `position_base = 0`, the smallest `abs_pos` failing the
    // `rel <= max_rel` test is `u32::MAX - DFAST_REBASE_GUARD_BAND + 1`.
    // One `reduce(DFAST_REBASE_GUARD_BAND)` then moves the base forward by
    // exactly `DFAST_REBASE_GUARD_BAND`.
    let trigger_abs = (u32::MAX as usize) - guard_band + 1;
    assert_eq!(matcher.position_base, 0);
    matcher.ensure_room_for(trigger_abs);
    assert_eq!(
        matcher.position_base, guard_band,
        "rebase must advance position_base by DFAST_REBASE_GUARD_BAND"
    );

    // abs=1024 was stored as packed slot 1025. The rebase subtracts
    // `DFAST_REBASE_GUARD_BAND` (= 2^30) from every slot, and 1025 <= 2^30,
    // so the entry collapses to the empty sentinel — upstream zstd parity
    // for `ZSTD_window_reduce`'s clamp-at-zero rule. Both tables are checked
    // because `reduce()` walks them one after the other.
    assert_eq!(
        matcher.short_hash[0], DFAST_EMPTY_SLOT,
        "pre-rebase short-hash entries below the reducer must become empty"
    );
    assert_eq!(
        matcher.long_hash[0], DFAST_EMPTY_SLOT,
        "pre-rebase long-hash entries below the reducer must become empty"
    );

    // Round-trip a fresh insert at the trigger position: it must pack to a
    // non-sentinel slot and unpack (`position_base + slot - 1`) to the same
    // absolute position.
    let fresh_slot = matcher.pack_slot(trigger_abs);
    assert_ne!(fresh_slot, DFAST_EMPTY_SLOT);
    let round_tripped = matcher.position_base + (fresh_slot as usize) - 1;
    assert_eq!(
        round_tripped, trigger_abs,
        "post-rebase pack/unpack must round-trip the absolute position"
    );
}
12394
/// Two consecutive blocks are skipped with the `Some(true)` hint (the sparse
/// variant, going by this test's name). An 11-byte pattern straddles their
/// boundary: 3 bytes end the first block and 8 bytes start the second. A third
/// block that begins with the full pattern must open with a match reaching
/// back to that boundary start, with no leading literals.
#[test]
fn dfast_sparse_skip_matching_backfills_previous_tail_for_consecutive_sparse_blocks() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let boundary_prefix = [0xFA, 0xFB, 0xFC];
    let boundary_suffix = [0xFD, 0xEE, 0xAD, 0xBE, 0xEF, 0x11, 0x22, 0x33];

    // First block ends with the pattern's prefix.
    let mut first = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    let first_tail_start = first.len() - boundary_prefix.len();
    first[first_tail_start..].copy_from_slice(&boundary_prefix);
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Second block starts with the pattern's suffix. Only its length is
    // needed later, so record that and move the buffer instead of cloning it.
    let mut second = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    second[..boundary_suffix.len()].copy_from_slice(&boundary_suffix);
    let second_len = second.len();
    matcher.add_data(second, |_| {});
    matcher.skip_matching(Some(true));

    // Third block: the whole pattern, then bytes that do not repeat.
    let mut third = boundary_prefix.to_vec();
    third.extend_from_slice(&boundary_suffix);
    third.extend_from_slice(b"-trailing-literals");
    matcher.add_data(third, |_| {});

    // Capture only the first emitted sequence as
    // (literal_len, offset, match_len); a literals-only sequence is recorded
    // with offset and match_len of 0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate match from the prior sparse-skip boundary"
    );
    // The pattern starts `boundary_prefix.len()` bytes before the end of the
    // first block, i.e. one whole second block plus the prefix behind the
    // start of the third block.
    assert_eq!(
        offset,
        second_len + boundary_prefix.len(),
        "expected match against backfilled first→second boundary start"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12447
/// Round-trip check for `CompressionLevel::Fastest` with a source-size hint:
/// replaying every sequence the driver emits (literals, then back-reference
/// copies) must rebuild the input byte-for-byte.
///
/// The fixture is iteration 23 of a seeded generator — 2047 pseudo-random
/// bytes followed by two copies of `data[128..256]`.
#[test]
fn fastest_hint_iteration_23_sequences_reconstruct_source() {
    /// Deterministic pseudo-random bytes from a 64-bit LCG (wrapping
    /// multiply/add), taking bits 33.. of each state as the next byte.
    fn generate_data(seed: u64, len: usize) -> Vec<u8> {
        let mut state = seed;
        let mut data = Vec::with_capacity(len);
        for _ in 0..len {
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            data.push((state >> 33) as u8);
        }
        data
    }

    let i = 23u64;
    let len = (i * 89 % 16384) as usize;
    let mut data = generate_data(i, len);
    // Append a repeated slice so the input contains long exact repeats the
    // matcher CAN turn into `Sequence::Triple`. Whether it actually does is
    // not asserted: the upstream zstd-shape kernel walks ip0 with
    // kSearchStrength-driven stride growth and may step over the repeat
    // region once the step has grown large (the legacy SuffixStore matcher
    // visited every position and always hit it). Emitting a Triple here is a
    // tuning preference, not a correctness invariant.
    let repeat = data[128..256].to_vec();
    data.extend_from_slice(&repeat);
    data.extend_from_slice(&repeat);

    let mut driver = MatchGeneratorDriver::new(1024 * 128, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Fastest);
    let mut space = driver.get_next_space();
    space[..data.len()].copy_from_slice(&data);
    space.truncate(data.len());
    driver.commit_space(space);

    let mut rebuilt = Vec::with_capacity(data.len());
    driver.start_matching(|seq| match seq {
        Sequence::Literals { literals } => rebuilt.extend_from_slice(literals),
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            rebuilt.extend_from_slice(literals);
            assert!(offset > 0, "offset must be non-zero");
            assert!(
                offset <= rebuilt.len(),
                "offset must reference already-produced bytes: offset={} produced={}",
                offset,
                rebuilt.len()
            );
            // Copy one byte at a time: when `match_len > offset` the match
            // overlaps the bytes it is producing, so later reads must see
            // earlier pushes.
            let start = rebuilt.len() - offset;
            for idx in 0..match_len {
                let b = rebuilt[start + idx];
                rebuilt.push(b);
            }
        }
    });

    // The substance of this test: the emitted sequences reproduce the input.
    assert_eq!(rebuilt, data);
}
12517
/// Pins the Fast-strategy parameters (`hash_log`, `mls`, `step_size`, and for
/// the named levels also `window_log`) that `resolve_level_params` reports
/// for level 1, the negative levels -7..=-1, `Fastest` and `Uncompressed`.
#[test]
fn fast_levels_dispatch_per_level_hash_log_and_mls() {
    // Level 1 — upstream zstd `{ 19, 13, 14, 1, 7, 0, ZSTD_fast }` row:
    // window_log=19, hash_log=14, mls=7.
    let f1 = resolve_level_params(CompressionLevel::Level(1), None)
        .fast
        .unwrap();
    assert_eq!(f1.hash_log, 14);
    assert_eq!(f1.mls, 7);
    assert_eq!(f1.step_size, 2);

    // Negative levels — upstream zstd row-0 ("base for negative"):
    // hash_log=13, mls=7. The 32 KiB table is L1d-resident (every
    // probe an L1 hit, vs an L2 access for a 64 KiB hash_log=14
    // table), and minMatch=7 drops short-distance 6-byte matches —
    // upstream zstd parity on both ratio and throughput.
    // step_size follows upstream zstd's formula: targetLength = -level,
    // step_size = (-level) + 1, giving 2..8 for L-1..L-7.
    for n in -7..=-1 {
        let f = resolve_level_params(CompressionLevel::Level(n), None)
            .fast
            .unwrap();
        assert_eq!(f.hash_log, 13, "Level({n}) fast_hash_log");
        assert_eq!(f.mls, 7, "Level({n}) fast_mls");
        let expected_step = ((-n) as usize) + 1;
        assert_eq!(f.step_size, expected_step, "Level({n}) fast_step_size");
    }

    // Fastest + Uncompressed keep hash_log=14 / mls=6 (their own
    // tuning; not part of the negative-level upstream zstd ladder).
    let pf = resolve_level_params(CompressionLevel::Fastest, None);
    let ff = pf.fast.unwrap();
    assert_eq!(
        (pf.window_log, ff.hash_log, ff.mls, ff.step_size),
        (19, 14, 6, 2),
    );
    // Uncompressed keeps window_log=17 (no history references, smaller
    // decoder reservation). Its fast cParams are the same tuple as
    // `Fastest` above (hash_log=14, mls=6, step_size=2) — NOT the
    // negative-level base row (hash_log=13, mls=7).
    let pu = resolve_level_params(CompressionLevel::Uncompressed, None);
    let fu = pu.fast.unwrap();
    assert_eq!(
        (pu.window_log, fu.hash_log, fu.mls, fu.step_size),
        (17, 14, 6, 2),
    );
}
12563
/// End-to-end wiring check: for each Fast level, reset a
/// `MatchGeneratorDriver` and verify that the inner `FastKernelMatcher`
/// reports the same `(hash_log, mls, step_size)` tuple as
/// `resolve_level_params`. This catches plumbing mistakes that the
/// parameter-only test cannot see — swapped arguments, a stale `step_size`
/// surviving from a previous frame, or values stuck on their defaults.
#[test]
fn fast_levels_driver_wiring_threads_cparams_into_inner_matcher() {
    let mut driver = MatchGeneratorDriver::new(64 * 1024, 1);

    // Level 1, Fastest, Uncompressed, then the negative ladder -1..=-7.
    let fast_levels = [
        CompressionLevel::Level(1),
        CompressionLevel::Fastest,
        CompressionLevel::Uncompressed,
    ]
    .into_iter()
    .chain((1..=7).map(|n| CompressionLevel::Level(-n)));

    for level in fast_levels {
        let resolved = resolve_level_params(level, None);
        // Guard the premise: a level that does not resolve to the Fast
        // strategy would make the rest of this iteration meaningless.
        assert_eq!(
            resolved.strategy_tag,
            super::strategy::StrategyTag::Fast,
            "{level:?} must resolve to Fast strategy",
        );

        // Hop through a non-Fast strategy before the real reset so the
        // backend-switch path runs (`MatchGeneratorDriver::new` /
        // `simple_mut` recreate the Fast variant via
        // `FastKernelMatcher::with_params`). Staying inside
        // `BackendTag::Simple` would only exercise
        // `FastKernelMatcher::reset` and leave the `with_params` wiring
        // untested on the production path. `Default` resolves to the Dfast
        // strategy (a non-Fast row), which is enough to force the swap.
        crate::encoding::Matcher::reset(&mut driver, CompressionLevel::Default);

        // The production reset path, as driven by FrameCompressor /
        // StreamingEncoder.
        crate::encoding::Matcher::reset(&mut driver, level);

        let want = resolved.fast.unwrap();
        let inner = driver.simple_mut();
        assert_eq!(
            inner.hash_log(),
            want.hash_log,
            "{level:?}: inner matcher hash_log mismatch — argument swap?",
        );
        assert_eq!(
            inner.mls(),
            want.mls,
            "{level:?}: inner matcher mls mismatch — argument swap?",
        );
        assert_eq!(
            inner.step_size(),
            want.step_size,
            "{level:?}: inner matcher step_size mismatch — stale value carried from prior reset?",
        );
    }
}
12633
/// Locks `target_len` for levels 5-15 to the reference
/// `cParams.targetLength` column of `clevels.h` table[0] (the default table,
/// `srcSize > 256 KB`). The reference lazy outer loop uses `targetLength` as
/// `sufficient_len`: the "nice match" length at which the chain walk stops
/// looking for anything better.
///
/// Levels 13-15 are btlazy2 in the reference but run on the hash-chain Lazy
/// parser here; the reference `targetLength` (32) is the same nice-match
/// threshold for both finders, so it is mirrored unchanged.
///
/// The expected column is transcribed inline, keeping this a pure-Rust
/// in-tree test with no FFI dependency.
#[test]
fn lazy_band_target_len_matches_default_table() {
    // (level, targetLength) from table[0] (srcSize > 256 KB), levels 5..=15.
    let reference_rows: [(i32, usize); 11] = [
        (5, 2),
        (6, 4),
        (7, 8),
        (8, 16),
        (9, 16),
        (10, 16),
        (11, 16),
        (12, 32),
        (13, 32),
        (14, 32),
        (15, 32),
    ];
    for (level, want) in reference_rows {
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        // L5 = greedy (Row backend → `row`); L6-15 = lazy (HashChain → `hc`).
        // Prefer the hash-chain config and fall back to the row config.
        let target_len = if let Some(hc) = params.hc {
            hc.target_len
        } else if let Some(row) = params.row {
            row.target_len
        } else {
            panic!("lazy/greedy level carries hc or row config")
        };
        assert_eq!(target_len, want, "L{level}: target_len must match table[0]");
    }
}
12674
/// Levels 13-15 must carry the reference btlazy2 budget from `clevels.h`
/// table[0]: `search_depth == 1 << cParams.searchLog` (16 / 32 / 64), and
/// `window_log` / `hash_log` / `chain_log` equal to the reference
/// `windowLog` / `hashLog` / `chainLog`. These levels run on the hash-chain
/// Lazy parser instead of a binary-tree finder, so they do not re-establish a
/// strict ratio ladder above L12 on window-fitting inputs; checking the whole
/// row (not only `search_depth`) keeps the budget aligned and protects every
/// field from silent drift.
#[test]
fn upper_lazy_band_params_match_default_table() {
    // table[0] (srcSize > 256 KB), levels 13..=15 (btlazy2 budget), as
    // (level, windowLog, hashLog, chainLog, search_depth = 1 << searchLog).
    let reference_rows: [(i32, u8, usize, usize, usize); 3] = [
        (13, 22, 22, 22, 1 << 4),
        (14, 22, 23, 22, 1 << 5),
        (15, 22, 23, 23, 1 << 6),
    ];
    for (level, window_log, hash_log, chain_log, search_depth) in reference_rows {
        let resolved = resolve_level_params(CompressionLevel::Level(level), None);
        let chain_cfg = resolved.hc.unwrap();
        assert_eq!(chain_cfg.search_depth, search_depth, "L{level}: search_depth");
        assert_eq!(resolved.window_log, window_log, "L{level}: window_log");
        assert_eq!(chain_cfg.hash_log, hash_log, "L{level}: hash_log");
        assert_eq!(chain_cfg.chain_log, chain_log, "L{level}: chain_log");
    }
}