structured_zstd/encoding/
match_generator.rs

//! Matching algorithm used to find repeated parts in the original data
2//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
4//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
5//!
6//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
7
8use alloc::vec::Vec;
9// SIMD/CRC intrinsics now live in `crate::encoding::fastpath::*` where they
10// sit under per-CPU `#[target_feature]` umbrellas; no architecture-specific
11// intrinsic imports remain in this file.
12use super::CompressionLevel;
13use super::Matcher;
14use super::Sequence;
15use super::blocks::encode_offset_with_history;
16use super::bt::BtMatcher;
17#[cfg(test)]
18use super::cost_model::HC_MAX_LIT;
19use super::cost_model::{
20    HC_BITCOST_MULTIPLIER, HC_FORMAT_MINMATCH, HC_OPT_NODE_LEN, HC_OPT_NUM, HC_OPT_PRICE_ARENA_LEN,
21    HC_OPT_PRICE_STRIDE, HC_PREDEF_THRESHOLD, HcOptState, HcOptimalCostProfile,
22};
23#[cfg(test)]
24use super::cost_model::{HC_BLOCKSIZE_MAX, HC_MAX_LL, HC_MAX_ML, HC_MAX_OFF, HcOptPriceType};
25use super::dfast::DfastMatchGenerator;
26// FAST_HASH_FILL_STEP test-only re-export was tied to the legacy
27// SuffixStore MatchGenerator's interleaved hash-fill stride. The
28// upstream zstd-shape Fast kernel walks ip0 with kSearchStrength step-skip
29// acceleration instead, so the constant has no consumer in the
30// remaining live test set today.
31#[cfg(test)]
32use super::match_table::helpers::INCOMPRESSIBLE_SKIP_STEP;
33use super::match_table::helpers::MIN_MATCH_LEN;
34#[cfg(test)]
35use super::match_table::helpers::common_prefix_len;
36#[cfg(test)]
37use super::opt::ldm::HcRawSeq;
38use super::opt::ldm::{HcOptLdmState, HcRawSeqStore};
39use super::opt::types::{
40    HcCandidateQuery, HcOptimalNode, HcOptimalPlanBuffers, HcOptimalPlanState, HcOptimalSequence,
41    MatchCandidate,
42};
43use super::row::RowMatchGenerator;
44use super::simple::fast_matcher::{FAST_LEVEL_1_HASH_LOG, FAST_LEVEL_1_MLS, FastKernelMatcher};
45#[cfg(all(
46    test,
47    feature = "std",
48    target_arch = "aarch64",
49    target_endian = "little"
50))]
51use std::arch::is_aarch64_feature_detected;
52#[cfg(all(test, feature = "std", target_arch = "x86_64"))]
53use std::arch::is_x86_feature_detected;
54
/// Minimum match length for the Double-Fast matcher (the upstream
/// zstd-fixed `minMatch = 5` for dfast).
pub(crate) const DFAST_MIN_MATCH_LEN: usize = 5;
/// Bytes the dfast short hash reads (upstream zstd `mls = 5`). Seeding / lookahead
/// guards use it so a position is only short-hashed once its full 5-byte key
/// is in range.
pub(crate) const DFAST_SHORT_HASH_LOOKAHEAD: usize = 5;
/// Default acceptance floor (`RowConfig::mls`) for the Row matcher: a row
/// candidate must extend to at least this many bytes to be accepted.
pub(crate) const ROW_MIN_MATCH_LEN: usize = 5;
61// Upstream zstd `clevels.h:31` at level 3 large-input bucket sets
62// `hashLog = 17` (the long-hash table) and `chainLog = 16` (the
63// short-hash table — upstream zstd names this `chainTable` even though for
64// dfast it's used as a plain single-slot hash). Each table holds one
// `U32` per slot; upstream zstd overwrites on collision and recovers
66// compression quality via the inline `_search_next_long` retry
67// (after a short-hash hit, probes `hashLong[hl1]` at `ip + 1` and
68// keeps the longer match).
69//
70// We mirror that storage layout: single `u32` per bucket (no
71// `[u32; N]` array), `long_hash` sized `1 << DFAST_HASH_BITS` and
72// `short_hash` one bit smaller via `DFAST_SHORT_HASH_BITS_DELTA`.
73// Two-table footprint at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`,
74// exact upstream parity. The `_search_next_long` retry lives in
75// `DfastMatchGenerator::hash_candidate` (called via
76// `best_match`). Earlier revisions kept a
77// 4-slot bucket per hash position; that paid 4× the upstream zstd memory
78// without measurable ratio gain once the retry was in place.
79//
80// `dfast_hash_bits_for_window` still clamps the runtime long-hash
81// value to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the
82// upper bound rather than a fixed default.
/// Upper bound, in bits, for the dfast long-hash table width (the Level 3
/// upstream `hashLog`); the runtime value may be clamped lower.
pub(crate) const DFAST_HASH_BITS: usize = 17;
/// Difference between `long_hash_bits` and `short_hash_bits` —
/// upstream zstd `hashLog - chainLog` is 1 at every dfast level (`clevels.h`
/// level 2: 16-15=1; level 3: 17-16=1). The short hash is one bit
/// smaller than the long hash so the per-bucket footprint matches
/// upstream zstd sizing exactly.
///
/// NOTE(review): `DFAST_L4` in this file uses equal widths (long 18 /
/// short 18), so the delta is not 1 at every dfast level — confirm this
/// constant only describes the default (Level 3) sizing.
pub(crate) const DFAST_SHORT_HASH_BITS_DELTA: usize = 1;
/// Sentinel value for an empty slot in the dfast hash tables. Real
/// positions are stored as `(abs_pos - position_base + 1) as u32`, so
/// `0` is reserved as the "empty" marker and a true relative offset
/// of `0` never appears in the table. Mirrors the LDM table's
/// `LdmEntry.offset == 0` convention (see `encoding/ldm/table.rs`)
/// so both rebasing structures share one sentinel scheme.
pub(crate) const DFAST_EMPTY_SLOT: u32 = 0;
98
/// Guard band reserved above the high-water mark before triggering a
/// rebase on the Dfast hash tables. When the next insert would push a
/// relative offset above `u32::MAX - DFAST_REBASE_GUARD_BAND`, the
/// table calls `reduce(GUARD_BAND)` to shift every slot down and
/// advance `position_base` so future inserts stay inside the `u32`
/// window. Same scheme as `encoding/ldm/table.rs`.
pub(crate) const DFAST_REBASE_GUARD_BAND: u32 = 1u32 << 30;
/// Log2 of [`DFAST_SKIP_STEP_GROWTH_INTERVAL`].
/// NOTE(review): presumably the dfast analogue of upstream zstd
/// `kSearchStrength` — confirm against the dfast scan loop.
pub(crate) const DFAST_SKIP_SEARCH_STRENGTH: usize = 6;
/// `1 << DFAST_SKIP_SEARCH_STRENGTH` (= 64).
/// NOTE(review): presumably the miss interval after which the dfast skip
/// step grows — confirm against the consumer.
pub(crate) const DFAST_SKIP_STEP_GROWTH_INTERVAL: usize = 1 << DFAST_SKIP_SEARCH_STRENGTH;
/// NOTE(review): presumably the cap on the dfast skip step — confirm.
pub(crate) const DFAST_MAX_SKIP_STEP: usize = 8;
/// NOTE(review): presumably the skip step applied on data judged
/// incompressible — confirm against the dfast matcher.
pub(crate) const DFAST_INCOMPRESSIBLE_SKIP_STEP: usize = 16;
/// Default Row hash-table width in bits (`RowConfig::hash_bits` of `ROW_CONFIG`).
pub(crate) const ROW_HASH_BITS: usize = 20;
/// Default `RowConfig::row_log` of `ROW_CONFIG`.
pub(crate) const ROW_LOG: usize = 5;
/// Default `RowConfig::search_depth` of `ROW_CONFIG`.
pub(crate) const ROW_SEARCH_DEPTH: usize = 16;
/// Default `RowConfig::target_len` of `ROW_CONFIG`.
pub(crate) const ROW_TARGET_LEN: usize = 48;
/// NOTE(review): presumably the per-slot tag width (bits) of the row
/// matcher — confirm against `RowMatchGenerator`.
pub(crate) const ROW_TAG_BITS: usize = 8;
/// NOTE(review): presumably the "empty slot" sentinel for row tables; note
/// that it is `u32::MAX`, unlike the dfast sentinel `DFAST_EMPTY_SLOT` (0).
pub(crate) const ROW_EMPTY_SLOT: u32 = u32::MAX;
/// Row hash key width in bytes; stays 4 regardless of `RowConfig::mls`.
pub(crate) const ROW_HASH_KEY_LEN: usize = 4;
117// HASH_MIX_PRIME now lives in `crate::encoding::fastpath::scalar`; the four
118// per-CPU `hash_mix_u64` variants share it via that module.
119// HC_PRIME3BYTES / HC_PRIME4BYTES moved to match_table::storage
120// alongside the hash helpers in Phase 1e Stage A. Only the test
121// module references the constants directly (production code goes
122// through `MatchTable::hash_value_with_mls`).
123#[cfg(test)]
124use super::match_table::storage::{HC_PRIME3BYTES, HC_PRIME4BYTES};
125
126// HC_HASH_LOG / HC_CHAIN_LOG / HC3_HASH_LOG / HC_EMPTY live on the
127// shared storage module so MatchTable methods can reference them
128// without pulling in this module. Re-imported here so existing
129// macros / configs / tests keep their unqualified names.
130#[cfg(test)]
131use super::match_table::storage::HC_EMPTY;
132use super::match_table::storage::HC3_HASH_LOG;
133// HC_HASH_LOG / HC_CHAIN_LOG feed the test-only `HC_CONFIG` default.
134#[cfg(test)]
135use super::match_table::storage::{HC_CHAIN_LOG, HC_HASH_LOG};
136// HC3_MAX_OFFSET moved to encoding::bt alongside the hash3 candidate
137// probe macro that consumes it; the macro references it via the
138// fully-qualified `$crate::encoding::bt::HC3_MAX_OFFSET` path so this
139// module no longer needs a local import.
/// Default `HcConfig::search_depth` used by the default / override-default
/// hash-chain configs in this file.
const HC_SEARCH_DEPTH: usize = 16;
// HC_MIN_MATCH_LEN moved to encoding::hc; re-imported here so
// existing references compile unchanged.
use super::hc::HC_MIN_MATCH_LEN;
/// Minimum match length for the optimal parser; aliases the cost model's
/// `HC_FORMAT_MINMATCH`.
const HC_OPT_MIN_MATCH_LEN: usize = HC_FORMAT_MINMATCH;
/// Default `HcConfig::target_len` used by the default / override-default
/// hash-chain configs in this file.
const HC_TARGET_LEN: usize = 48;
146
147// MAX_HC_SEARCH_DEPTH moved to encoding::hc alongside chain_candidates.
148use super::hc::MAX_HC_SEARCH_DEPTH;
149
150// `Strategy` and `StrategyTag` live in `crate::encoding::strategy`.
151// The driver carries a `StrategyTag` field set at `reset()` and
152// dispatches each block into a monomorphised `compress_block::<S>`
153// per concrete strategy.
154
/// Bundled tuning knobs for the hash-chain matcher. Using a typed config
/// instead of positional `usize` args eliminates parameter-order hazards.
#[derive(Copy, Clone, PartialEq, Eq)]
struct HcConfig {
    /// Hash-table width in bits (the `hash_log` override knob writes here).
    hash_log: usize,
    /// Chain-table width in bits (the `chain_log` override knob writes here).
    chain_log: usize,
    /// Search depth; a `search_log` override is stored as `1 << search_log`.
    search_depth: usize,
    /// Target match length (the `target_length` override knob writes here).
    /// NOTE(review): presumably the "nice match" early-out length — confirm
    /// against the matcher body.
    target_len: usize,
    /// Binary-tree finder hash width (upstream zstd `mls = BOUNDED(4, minMatch, 6)`),
    /// carried explicitly per level so it is NOT inferred from `target_len`
    /// (a `target_length` override must not silently flip the finder between
    /// 5- and 4-byte hashing). Only the BT body reads it; HC/lazy levels keep
    /// it at 4 (their `hash_position` is always 4-byte). 5 for the
    /// minMatch=5 BT levels (btlazy2 + btopt L16), 4 elsewhere.
    search_mls: usize,
}
171
/// Bundled tuning knobs for the Row matcher, one value per level row.
#[derive(Copy, Clone, PartialEq, Eq)]
pub(crate) struct RowConfig {
    /// Row hash-table width in bits (upstream zstd `hashLog`; the `hash_log`
    /// override knob writes here).
    pub(crate) hash_bits: usize,
    /// Upstream zstd `rowLog = clamp(searchLog, 4, 6)`.
    pub(crate) row_log: usize,
    /// Upstream zstd `nbAttempts = 1 << min(searchLog, rowLog)`.
    pub(crate) search_depth: usize,
    /// Upstream zstd `targetLength` (nice-match early-out).
    pub(crate) target_len: usize,
    /// Upstream zstd `cParams.minMatch` for the row matcher: the regular-search
    /// acceptance floor (a row candidate must extend to >= `mls` bytes).
    /// The C-like advanced API surfaces this as the row min-match knob.
    /// `ROW_MIN_MATCH_LEN` (5) is the default; the row hash key width stays
    /// 4 bytes (an internal detail), so this only tunes the acceptance
    /// floor, not the candidate hash distribution.
    pub(crate) mls: usize,
}
186
// Only used as the default HashChain config when the test-only parse×search
// override pairs a level with a backend its native row doesn't populate.
// Field values match `HC_OVERRIDE_DEFAULT`, the non-test counterpart.
#[cfg(test)]
const HC_CONFIG: HcConfig = HcConfig {
    hash_log: HC_HASH_LOG,
    chain_log: HC_CHAIN_LOG,
    search_depth: HC_SEARCH_DEPTH,
    target_len: HC_TARGET_LEN,
    // 4-byte finder hash: the non-BT default (see `HcConfig::search_mls`).
    search_mls: 4,
};
197
/// Base HashChain config synthesized when a public-parameter strategy
/// override ([`super::parameters`]) routes a level to the HC / BT
/// backend whose native level row didn't populate `hc` (e.g. forcing
/// `Strategy::Lazy2` onto a level the table resolves to Fast). Mirrors
/// the mid-band lazy defaults; the per-knob overrides then refine it.
const HC_OVERRIDE_DEFAULT: HcConfig = HcConfig {
    // Fully-qualified paths: the unqualified `HC_HASH_LOG` / `HC_CHAIN_LOG`
    // imports in this file are `#[cfg(test)]`-only.
    hash_log: super::match_table::storage::HC_HASH_LOG,
    chain_log: super::match_table::storage::HC_CHAIN_LOG,
    search_depth: HC_SEARCH_DEPTH,
    target_len: HC_TARGET_LEN,
    // 4-byte finder hash; `apply_param_overrides` raises it to 5 when it
    // synthesizes this row for a `Btlazy2` strategy override.
    search_mls: 4,
};
210
// Hash-chain / binary-tree configs for the btultra2 band.
// NOTE(review): the selection logic is not visible in this part of the file;
// the `_L22*` suffixes presumably name the level-22 row and its source-size
// buckets (<= 256 KiB / <= 128 KiB / <= 16 KiB) — confirm against the level
// table before relying on these descriptions.

/// btultra2 config below level 22 (NOTE(review): exact level range unconfirmed).
const BTULTRA2_HC_CONFIG: HcConfig = HcConfig {
    hash_log: 24,
    chain_log: 24,
    search_depth: 512,
    target_len: 256,
    search_mls: 4,
};

/// Level-22 config for unbounded / large sources (search depth 512 = `1 << 9`).
const BTULTRA2_HC_CONFIG_L22: HcConfig = HcConfig {
    hash_log: 25,
    chain_log: 27,
    search_depth: 512,
    target_len: 999,
    search_mls: 4,
};

/// Level-22 config, presumably for source-size hints up to 256 KiB.
const BTULTRA2_HC_CONFIG_L22_256K: HcConfig = HcConfig {
    hash_log: 19,
    chain_log: 19,
    search_depth: 1 << 13,
    target_len: 999,
    search_mls: 4,
};

/// Level-22 config, presumably for source-size hints up to 128 KiB.
const BTULTRA2_HC_CONFIG_L22_128K: HcConfig = HcConfig {
    hash_log: 17,
    chain_log: 18,
    search_depth: 1 << 11,
    target_len: 999,
    search_mls: 4,
};

/// Level-22 config, presumably for source-size hints up to 16 KiB.
const BTULTRA2_HC_CONFIG_L22_16K: HcConfig = HcConfig {
    hash_log: 15,
    chain_log: 15,
    search_depth: 1 << 10,
    target_len: 999,
    search_mls: 4,
};
250
// Default Row config: only used by tests and the test-only parse×search
// override (production greedy L5 carries its own `ROW_L5`). Every field is
// taken from the matching `ROW_*` default constant.
#[cfg(test)]
const ROW_CONFIG: RowConfig = RowConfig {
    hash_bits: ROW_HASH_BITS,
    row_log: ROW_LOG,
    search_depth: ROW_SEARCH_DEPTH,
    target_len: ROW_TARGET_LEN,
    mls: ROW_MIN_MATCH_LEN,
};
261
// Level-5 greedy is the ONLY strategy routed to the Row backend
// (`StrategyTag::backend`: greedy -> Row; lazy / btopt / btultra* ->
// HashChain), so it is the only level whose `row:` field is read. The upstream zstd
// `clevels.h` default row (srcSize > 256 KB) for level 5 is searchLog=3,
// targetLength=2, from which the row matcher derives:
//   rowLog       = clamp(searchLog, 4, 6) = 4
//   search_depth = 1 << min(searchLog, rowLog) = 8   (= nbAttempts)
//   target_len   = targetLength = 2                  (nice-match early-out)
// The shared `ROW_CONFIG` (row_log=5, search_depth=16, target_len=48) ran a
// level-12-grade search here: 16 slots per row, never early-exiting until a
// 48-byte match. That exhaustive walk was the dominant cost in greedy L5's
// encode-speed regression vs FFI. `hash_bits` matches upstream zstd's
// `ZSTD_getCParams(5, .., 0).hashLog` = 19 (verified via
// `cparams_check 5`), so the row table is the same width as upstream's
// (2^19 slots); the previous `ROW_HASH_BITS` (20) doubled both row tables vs
// upstream, the dominant peak-memory excess on the greedy band.
//
// `ROW_L5` is also the row synthesized by `apply_param_overrides` when a
// strategy override routes a level without a native `row` to the Row backend.
const ROW_L5: RowConfig = RowConfig {
    hash_bits: 19,
    row_log: 4,
    search_depth: 8,
    target_len: 2,
    mls: ROW_MIN_MATCH_LEN,
};
285
// Upstream zstd `clevels.h` unbounded defaults for the lazy band, verified via
// `ZSTD_getCParams(level, 0, 0)`:
//   L6  { w21 c18 h19 s3 mml5 t4  lazy  } → rowLog 4, depth 1<<3 = 8
//   L7  { w21 c19 h20 s4 mml5 t8  lazy  } → rowLog 4, depth 16
//   L8  { w21 c19 h20 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
//   L9  { w22 c20 h21 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
//   L10 { w22 c21 h22 s5 mml5 t16 lazy2 } → rowLog 5, depth 32
//   L11 { w22 c21 h22 s6 mml5 t16 lazy2 } → rowLog 6, depth 64
//   L12 { w22 c22 h23 s6 mml5 t32 lazy2 } → rowLog 6, depth 64
// `rowLog = clamp(searchLog, 4, 6)`, `depth = 1 << min(searchLog, rowLog)`
// (same derivation as `ROW_L5` above). `hash_bits` carries the upstream zstd
// `hashLog`; the hinted-source clamp in `configure` caps it by the window
// exactly like the upstream zstd `ZSTD_adjustCParams` path.
//
// In each row below: `hash_bits` = h, `target_len` = t, `mls` = mml (5).

// L6 (lazy): searchLog 3, targetLength 4.
const ROW_L6: RowConfig = RowConfig {
    hash_bits: 19,
    row_log: 4,
    search_depth: 8,
    target_len: 4,
    mls: ROW_MIN_MATCH_LEN,
};
// L7 (lazy): searchLog 4, targetLength 8.
const ROW_L7: RowConfig = RowConfig {
    hash_bits: 20,
    row_log: 4,
    search_depth: 16,
    target_len: 8,
    mls: ROW_MIN_MATCH_LEN,
};
// L8 (lazy2): searchLog 4, targetLength 16.
const ROW_L8: RowConfig = RowConfig {
    hash_bits: 20,
    row_log: 4,
    search_depth: 16,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L9 (lazy2): searchLog 4, targetLength 16.
const ROW_L9: RowConfig = RowConfig {
    hash_bits: 21,
    row_log: 4,
    search_depth: 16,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L10 (lazy2): searchLog 5, targetLength 16.
const ROW_L10: RowConfig = RowConfig {
    hash_bits: 22,
    row_log: 5,
    search_depth: 32,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L11 (lazy2): searchLog 6, targetLength 16.
const ROW_L11: RowConfig = RowConfig {
    hash_bits: 22,
    row_log: 6,
    search_depth: 64,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L12 (lazy2): searchLog 6, targetLength 32.
const ROW_L12: RowConfig = RowConfig {
    hash_bits: 23,
    row_log: 6,
    search_depth: 64,
    target_len: 32,
    mls: ROW_MIN_MATCH_LEN,
};
348
/// Per-level Double-Fast hash sizing, mirroring the upstream zstd `clevels.h` columns
/// (config-driven, not a hardcoded constant): `long_hash_log` =
/// `cParams.hashLog` (the long 8-byte hash table), `short_hash_log` =
/// `cParams.chainLog` (the short hash table dfast repurposes as its
/// secondary index). Only the Dfast backend reads it, so non-dfast level
/// rows carry `dfast: None`. `minMatch` stays the upstream zstd-fixed `5`
/// (`DFAST_MIN_MATCH_LEN`, used in const contexts).
#[derive(Copy, Clone, PartialEq, Eq)]
struct DfastConfig {
    /// Long (8-byte) hash table width in bits; the `hash_log` override knob
    /// writes here.
    long_hash_log: u8,
    /// Short hash table width in bits; the `chain_log` override knob writes
    /// here.
    short_hash_log: u8,
}
361
// Upstream zstd clevels.h default row (srcSize > 256 KB): L3 {hashLog 17, chainLog 16},
// L4 {hashLog 18, chainLog 18}.
// `DFAST_L3` is also the row synthesized by `apply_param_overrides` when a
// strategy override routes a level without a native `dfast` row to Double-Fast.
const DFAST_L3: DfastConfig = DfastConfig {
    long_hash_log: 17,
    short_hash_log: 16,
};
// Level 4: long and short tables share the same width.
const DFAST_L4: DfastConfig = DfastConfig {
    long_hash_log: 18,
    short_hash_log: 18,
};
372
/// Per-level Fast-strategy tuning, only consumed by the `FastKernelMatcher`
/// (Simple backend): `hash_log` = upstream zstd `cParams.hashLog`, `mls` = upstream zstd
/// `cParams.minMatch` (4..=8), `step_size` = upstream zstd `stepSize`. Carried as
/// `LevelParams.fast` (`Some` only on Fast level rows; `None` elsewhere).
#[derive(Copy, Clone, PartialEq, Eq)]
struct FastConfig {
    /// Hash-table width in bits; the `hash_log` override knob writes here.
    hash_log: u32,
    /// Minimum match length; the `min_match` override knob writes here
    /// unchanged (no clamping is applied in `apply_param_overrides`).
    mls: u32,
    /// Upstream zstd `stepSize`. No override knob in `apply_param_overrides`
    /// maps to this field.
    step_size: usize,
}
383
// Level-1 Fast row. Also the row synthesized by `apply_param_overrides` when a
// strategy override routes a level without a native `fast` row to Fast.
const FAST_L1: FastConfig = FastConfig {
    hash_log: 14,
    // Tier-0 (srcSize > 256 KiB) `cParams.minMatch`. Upstream zstd selects the
    // Level-1 row from a 4-way srcSize-tiered table (`ZSTD_getCParams_internal`
    // → `ZSTD_defaultCParameters[tableID][1]`), and minMatch shrinks for
    // smaller inputs: 7 (>256 KiB) / 6 (16..256 KiB) / 5 (<=16 KiB). The base
    // here is the tier-0 value; `fast_l1_mls_for_source_size` lowers it per the
    // tier in `adjust_params_for_source_size`.
    mls: 7,
    step_size: 2,
};
// Level-2 Fast row: wider hash table and a shorter minimum match than level 1.
const FAST_L2: FastConfig = FastConfig {
    hash_log: 16,
    mls: 6,
    step_size: 2,
};
400
/// Resolved tuning parameters for a compression level. The
/// [`StrategyTag`] is the single source of truth for the backend
/// family and the compile-time strategy consts; the runtime
/// [`BackendTag`] used by the driver dispatcher is derived via
/// [`StrategyTag::backend`] so the two cannot drift.
#[derive(Copy, Clone, PartialEq, Eq)]
struct LevelParams {
    /// Resolved strategy for the level; a strategy override replaces it.
    strategy_tag: super::strategy::StrategyTag,
    /// Decoupled search-method axis. Independent of `strategy_tag`'s
    /// parse half: a level can pair any parse (greedy / lazy depth via
    /// `lazy_depth`) with any search backend here. Defaults to the
    /// historical pairing (`strategy_tag.search()`) but is overridable
    /// per level so the parse×search matrix can be swept and tuned.
    search: super::strategy::SearchMethod,
    /// Window size as a log2; the `window_log` override knob writes here.
    window_log: u8,
    /// Lazy-parse depth. `>= 2` selects the lazy2 behaviour wherever this
    /// file distinguishes lazy from lazy2 (pre-split level, LDM ordinal);
    /// `parse()` feeds it to `ParseMode::from_lazy_depth` for non-BT search.
    lazy_depth: u8,
    /// Per-strategy tuning. Exactly one is `Some` on each level row, matching
    /// `strategy_tag`'s backend, so the table self-documents which knobs a
    /// level actually consumes (the others are `None`, not dead placeholders):
    /// `fast` for the Fast/Simple backend, `dfast` for Double-Fast, `hc` for
    /// the HashChain (lazy / btopt / btultra*) backend, `row` for the Row
    /// (greedy L5) backend.
    ///
    /// After `apply_param_overrides` re-routes a level, the synthesized row
    /// for the new backend is added alongside the native one, so more than
    /// one field may be `Some`.
    fast: Option<FastConfig>,
    dfast: Option<DfastConfig>,
    hc: Option<HcConfig>,
    row: Option<RowConfig>,
}
428
429impl LevelParams {
430    /// Backend family (storage variant) for the driver dispatcher.
431    /// Derived from the decoupled `search` axis so a level can route to
432    /// a different search backend than its `strategy_tag` historically
433    /// implied.
434    fn backend(&self) -> super::strategy::BackendTag {
435        self.search.backend()
436    }
437
438    /// Parse mode derived from the decoupled `search` axis: the binary-tree
439    /// search path carries `ParseMode::Optimal`; every other search backend
440    /// derives greedy/lazy/lazy2 from `lazy_depth`. Reading `search` (not the
441    /// strategy tag) keeps the parse×search decoupling complete even when a
442    /// level whose tag is `Bt*` is overridden to a non-BT search backend.
443    fn parse(&self) -> super::strategy::ParseMode {
444        match self.search {
445            super::strategy::SearchMethod::BinaryTree => super::strategy::ParseMode::Optimal,
446            _ => super::strategy::ParseMode::from_lazy_depth(self.lazy_depth),
447        }
448    }
449
450    /// Cheap fingerprint pre-splitter level (the C-like `blockSplitterLevel`):
451    /// the EFFECTIVE upstream `ZSTD_splitBlock` level that
452    /// `ZSTD_optimalBlockSize` dispatches, i.e. `splitLevels[strategy] - 2`
453    /// (clamped at 0), NOT the raw `splitLevels[]` value. `split_level == 0`
454    /// routes to the cheap from-borders heuristic; `1..=4` to byChunks with
455    /// internal sampling level `split_level - 1`. See the body for the
456    /// per-strategy tier table and why the raw-table mapping was wrong.
457    fn pre_split(&self) -> Option<u8> {
458        use super::strategy::StrategyTag;
459        // Effective upstream `ZSTD_splitBlock` level = `splitLevels[strat] - 2`
460        // (clamped at 0). Upstream `splitLevels[] = {0,0,1,2,2,3,3,4,4,4}` then
461        // subtracts 2 before dispatch, so the byChunks sampling tier is two
462        // steps coarser than the raw table: greedy/lazy(d1)=0 (from-borders),
463        // lazy2/btlazy2=1 (byChunks rate 43), btopt+=2 (byChunks rate 11).
464        // An earlier version mirrored the RAW table AND bumped lazy2 to the
465        // rate-1 full scan (split 4) to dodge a periodic-input phantom-split —
466        // that ran the pre-splitter at up to 43x upstream's sampling cost
467        // (~87% of L9 encode time on the decode corpus). Per the drop-in
468        // contract ratio only needs to stay <= upstream, so matching upstream's
469        // sampling tier (and accepting upstream's identical over-split on
470        // periodic input) is the dominant large-input encode-speed win.
471        Some(match self.strategy_tag {
472            // splitLevels 0/1 -> 0: upstream does not pre-split fast/dfast at
473            // all; from-borders is the cheapest stand-in and rarely splits.
474            StrategyTag::Fast | StrategyTag::Dfast => 0,
475            // greedy / lazy(depth 1): splitLevels 2 -> 0 (from-borders).
476            StrategyTag::Greedy => 0,
477            StrategyTag::Lazy => {
478                if self.lazy_depth >= 2 {
479                    1 // lazy2: splitLevels 3 -> 1 (byChunks rate 43)
480                } else {
481                    0 // lazy depth 1: splitLevels 2 -> 0 (from-borders)
482                }
483            }
484            StrategyTag::Btlazy2 => 1, // splitLevels 3 -> 1 (byChunks rate 43)
485            StrategyTag::BtOpt | StrategyTag::BtUltra | StrategyTag::BtUltra2 => 2,
486        })
487    }
488}
489
490/// Apply the public-parameter per-knob overrides (#27) onto the
491/// level-resolved [`LevelParams`], in place. Runs in [`Matcher::reset`]
492/// after the level params are computed and before backend selection, so
493/// a strategy override re-routes the backend uniformly. An all-`None`
494/// override is a no-op the caller skips via
495/// [`super::parameters::ParamOverrides::is_empty`], keeping the default
496/// level geometry byte-identical.
497fn apply_param_overrides(params: &mut LevelParams, ov: &super::parameters::ParamOverrides) {
498    use super::strategy::SearchMethod;
499
500    // 1. Strategy override re-derives tag / search / lazy depth.
501    if let Some(strategy) = ov.strategy {
502        let tag = strategy.tag();
503        params.strategy_tag = tag;
504        params.search = tag.search();
505        params.lazy_depth = strategy.lazy_depth();
506    }
507
508    // 2. Ensure the active backend's config row exists (synthesize a
509    //    default when a strategy override moved off the native row).
510    match params.search {
511        SearchMethod::Fast => {
512            params.fast.get_or_insert(FAST_L1);
513        }
514        SearchMethod::DoubleFast => {
515            params.dfast.get_or_insert(DFAST_L3);
516        }
517        SearchMethod::RowHash => {
518            params.row.get_or_insert(ROW_L5);
519        }
520        SearchMethod::HashChain | SearchMethod::BinaryTree => {
521            // A `Btlazy2` strategy override moved off a non-HC row needs the
522            // BT 5-byte finder hash (upstream zstd minMatch 5); other synthesized HC
523            // rows keep the 4-byte default. An explicit `min_match` override
524            // below refines this further.
525            params.hc.get_or_insert(HcConfig {
526                search_mls: if matches!(params.strategy_tag, super::strategy::StrategyTag::Btlazy2)
527                {
528                    5
529                } else {
530                    HC_OVERRIDE_DEFAULT.search_mls
531                },
532                ..HC_OVERRIDE_DEFAULT
533            });
534        }
535    }
536
537    // 3. window_log (bounds-checked at <= 30 by the builder).
538    if let Some(window_log) = ov.window_log {
539        params.window_log = window_log;
540    }
541
542    // 4. Per-backend numeric knobs map into the active config, mirroring
543    //    the upstream zstd `cParams` -> matcher translation documented on each
544    //    config struct.
545    match params.search {
546        SearchMethod::Fast => {
547            if let Some(fast) = params.fast.as_mut() {
548                if let Some(hash_log) = ov.hash_log {
549                    fast.hash_log = hash_log;
550                }
551                if let Some(min_match) = ov.min_match {
552                    fast.mls = min_match;
553                }
554            }
555        }
556        SearchMethod::DoubleFast => {
557            if let Some(dfast) = params.dfast.as_mut() {
558                // hashLog -> long table, chainLog -> short table (the
559                // dfast secondary index). Both bounds-checked <= 30, so
560                // the `u8` casts are lossless.
561                if let Some(hash_log) = ov.hash_log {
562                    dfast.long_hash_log = hash_log as u8;
563                }
564                if let Some(chain_log) = ov.chain_log {
565                    dfast.short_hash_log = chain_log as u8;
566                }
567            }
568        }
569        SearchMethod::RowHash => {
570            if let Some(row) = params.row.as_mut() {
571                // Row hash-table width override (mirrors dfast `long_hash_log`
572                // / hc `hash_log`). Row has no separate chain table — the
573                // per-row depth comes from `search_log` below — so only
574                // `hash_log` maps here; `chain_log` has no Row analogue.
575                if let Some(hash_log) = ov.hash_log {
576                    row.hash_bits = hash_log as usize;
577                }
578                if let Some(search_log) = ov.search_log {
579                    // Upstream zstd: rowLog = clamp(searchLog, 4, 6);
580                    //        nbAttempts = 1 << min(searchLog, rowLog).
581                    let row_log = (search_log as usize).clamp(4, 6);
582                    row.row_log = row_log;
583                    row.search_depth = 1usize << (search_log as usize).min(row_log);
584                }
585                if let Some(target_length) = ov.target_length {
586                    row.target_len = target_length as usize;
587                }
588                if let Some(min_match) = ov.min_match {
589                    row.mls = min_match as usize;
590                }
591            }
592        }
593        SearchMethod::HashChain | SearchMethod::BinaryTree => {
594            if let Some(hc) = params.hc.as_mut() {
595                if let Some(hash_log) = ov.hash_log {
596                    hc.hash_log = hash_log as usize;
597                }
598                if let Some(chain_log) = ov.chain_log {
599                    hc.chain_log = chain_log as usize;
600                }
601                if let Some(search_log) = ov.search_log {
602                    hc.search_depth = 1usize << search_log;
603                }
604                if let Some(target_length) = ov.target_length {
605                    hc.target_len = target_length as usize;
606                }
607                if let Some(min_match) = ov.min_match {
608                    // Upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`: a BT
609                    // min_match override maps into the finder hash width. Only
610                    // the BT body reads `search_mls`; HC/lazy keep 4-byte
611                    // hashing regardless, so this is a no-op for them.
612                    hc.search_mls = (min_match as usize).clamp(4, 6);
613                }
614            }
615        }
616    }
617}
618
/// Translate the resolved runtime strategy into the upstream zstd LDM
/// strategy ordinal (1..=9) consumed by
/// [`super::ldm::params::LdmParams::adjust_for`].
///
/// The collapsed `Lazy` tag is disambiguated by `lazy_depth`: depth below 2
/// is upstream `lazy` (4), depth 2 or more is `lazy2` (5).
#[cfg(feature = "hash")]
fn ldm_strategy_ordinal(tag: super::strategy::StrategyTag, lazy_depth: u8) -> u32 {
    use super::strategy::StrategyTag;
    match tag {
        StrategyTag::Fast => 1,
        StrategyTag::Dfast => 2,
        StrategyTag::Greedy => 3,
        StrategyTag::Lazy if lazy_depth < 2 => 4,
        StrategyTag::Lazy => 5,
        // Upstream zstd `ZSTD_btlazy2` ordinal.
        StrategyTag::Btlazy2 => 6,
        StrategyTag::BtOpt => 7,
        StrategyTag::BtUltra => 8,
        StrategyTag::BtUltra2 => 9,
    }
}
643
644/// `ceil(log2(size))` of a source-size hint, with a zero hint floored to
645/// [`MIN_WINDOW_LOG`]. This is the single quantization every hint-dependent
646/// matcher parameter is derived from: the window-log cap, the HC / Fast hash
647/// and chain widths, the Dfast / Row table widths, the L22 config buckets, and
648/// the Fast attach-vs-copy cutoff. Two hints sharing this value resolve to the
649/// identical matcher shape, which is why it (not the raw byte count) keys the
650/// primed-dictionary snapshot — see [`PrimedKey`]. Operates on the full `u64`
651/// so callers comparing a hint against a cutoff get the same bucketed decision
652/// here and at the driver, with no `as usize` truncation on 32-bit targets.
653pub(crate) fn source_size_ceil_log(size: u64) -> u8 {
654    if size == 0 {
655        MIN_WINDOW_LOG
656    } else {
657        (64 - (size - 1).leading_zeros()) as u8
658    }
659}
660
/// Attach-vs-copy cutoff for the Fast strategy, as a ceil-log bucket: a hint at
/// or below `2^this` (or unknown, `None`) ATTACHES the dictionary (a separate
/// immutable table scanned in place via the borrowed dual-base kernel); a larger
/// hint would COPY it into the live table.
///
/// We set this to `31` so every dictionary source up to 2 GiB attaches,
/// diverging from upstream zstd's 8 KiB `ZSTD_shouldAttachDict` cutoff ON
/// PURPOSE: upstream copy mode copies the small CDict TABLES into the cctx and
/// still scans the input in place, but our flat-history copy path memmoves the
/// whole INPUT into history every frame (profiled at 30% `__memmove` + 14%
/// `__memset` on a reused 1 MiB dict encode). Attach mode scans the caller's
/// input in place with the dict as a separate prefix base, so it is strictly
/// faster for every frame size here (measured: 1 MiB dict frame 167 us -> 52 us,
/// 0.42x of C; 10 KiB 20.4 us -> 4.4 us, 0.17x of C). The dual-base kernel
/// carries `window_low`, so over-window inputs stay in-window and C-decodable.
///
/// `31` is also the largest bucket the borrowed kernel can attach: it stores
/// virtual positions as `u32` (`cur_abs as u32`), so the maximum attached source
/// `1 << 31` (plus the dict prefix) stays below `u32::MAX`; the next bucket `32`
/// (4 GiB) would wrap that arithmetic. Sources past 2 GiB therefore fall back to
/// copy mode — rare in practice, and the relative copy cost shrinks as the
/// source grows. Per the drop-in-not-binary-parity contract, we make this match
/// decision ourselves.
///
/// The bucket is in the units returned by `source_size_ceil_log`. Shared by
/// `reset` (records the mode in the primed-snapshot key) and
/// `prime_with_dictionary` (acts on it).
pub(crate) const FAST_ATTACH_DICT_CUTOFF_LOG: u8 = 31;
687
688/// Largest dictionary region (bytes) the Fast attach path can index. The tagged
689/// dict table packs each position into `32 - DICT_TAG_BITS` (= 24) bits, so a
690/// region past `2^24` (16 MiB) would overflow the packed position. Dictionaries
691/// this large fall back to COPY mode, whose live table stores full `u32`
692/// positions and handles them. The size hint set on dict load equals the actual
693/// dict content length, so the attach-vs-copy decision (and the matching
694/// snapshot-key / epoch bits) can gate on it consistently at reset time.
695pub(crate) const MAX_FAST_ATTACH_DICT_REGION: usize = 1 << 24;
696
697/// Dfast counterpart of [`FAST_ATTACH_DICT_CUTOFF_LOG`]: upstream zstd
698/// `ZSTD_dictMatchState` attach cutoff for the double-fast strategy is 16 KiB
699/// (`2^14`), so small / unknown-size inputs ATTACH (separate immutable dict
700/// long+short tables + dual-probe in `start_matching_fast_loop`) and larger
701/// known-size inputs COPY (re-prime the dict into the live tables, where the
702/// dense scan matches it as window history). The attach build also self-gates
703/// on `use_fast_loop` inside `skip_matching_for_dict_attach` — only the
704/// fast-loop levels (L3 / Default / L0) carry the dual-probe.
705const DFAST_ATTACH_DICT_CUTOFF_LOG: u8 = 14;
706
707/// `ZSTD_dictMatchState` attach cutoff for the Row (greedy/lazy) strategy is
708/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs`): small / unknown-size inputs
709/// ATTACH the dict into the separate immutable row index (bounded dual-probe in
710/// `row_candidate_rl`), larger known-size inputs dense-COPY into the live rows.
711const ROW_ATTACH_DICT_CUTOFF_LOG: u8 = 15;
712
/// `ZSTD_dictMatchState` attach cutoff for the HashChain (lazy2) strategy is
/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs[ZSTD_lazy2]`): small /
/// unknown-size inputs ATTACH the dict as a separate hash-chain dms (the dual
/// search in `find_best_match` walks the live input chain + the dms), larger
/// known-size inputs dense-COPY (merge the dict into the live chain and search
/// the one combined chain).
718const HC_ATTACH_DICT_CUTOFF_LOG: u8 = 15;
719
720/// BT/optimal attach cutoff for `btlazy2` + `btopt`: 32 KiB (`2^15`, upstream
721/// zstd `attachDictSizeCutoffs[ZSTD_btlazy2]` == `[ZSTD_btopt]`). Small /
722/// unknown-size inputs ATTACH the dict as a separate DUBT dms; larger known-size
723/// inputs COPY the dict into the LIVE binary tree (upstream zstd
724/// `ZSTD_resetCCtx_byCopyingCDict`).
725const BT_OPT_ATTACH_DICT_CUTOFF_LOG: u8 = 15;
726
727/// BT/optimal attach cutoff for `btultra` + `btultra2`: 8 KiB (`2^13`, upstream
728/// zstd `attachDictSizeCutoffs[ZSTD_btultra]` == `[ZSTD_btultra2]`). The deepest
729/// parses copy the dict into the live tree past a much smaller source than the
730/// `btopt` tier, matching upstream's per-strategy cutoff table.
731const BT_ULTRA_ATTACH_DICT_CUTOFF_LOG: u8 = 13;
732
733// Source-size cap for the dfast hash bits when a size hint is present: a tiny
734// input needs no larger hash than its window. The upstream zstd `cParams.hashLog` /
735// `chainLog` (from `DfastConfig`) caps it from above at the call site.
736fn dfast_hash_bits_for_window(max_window_size: usize) -> usize {
737    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
738    window_log.max(MIN_WINDOW_LOG as usize)
739}
740
741fn row_hash_bits_for_window(max_window_size: usize) -> usize {
742    // Upstream zstd `ZSTD_adjustCParams_internal` cap: `hashLog <= windowLog + 1`.
743    // The `+ 1` is load-bearing for L12, whose upstream zstd hashLog (23) exceeds
744    // its windowLog (22) — a plain `windowLog` cap would shrink the L12
745    // table on EVERY hinted reset and split primed snapshots between
746    // hinted and unhinted frames that resolve to the identical geometry.
747    // No constant upper clamp: the old `ROW_HASH_BITS` (20) ceiling
748    // predates the lazy band moving onto Row (L9-12 carry upstream zstd hashLog
749    // 21-23).
750    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
751    (window_log + 1).max(MIN_WINDOW_LOG as usize)
752}
753
754/// `floor(log2(window))` for the HashChain table-log cap (upstream zstd
755/// `ZSTD_adjustCParams_internal`). The caller clamps the level's `hash_log` /
756/// `chain_log` from above with this so a small hinted input doesn't allocate the
757/// full level's tables.
758fn hc_hash_bits_for_window(max_window_size: usize) -> usize {
759    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
760    window_log.max(MIN_WINDOW_LOG as usize)
761}
762
763/// Parameter table for numeric compression levels 1–22.
764///
765/// Each entry maps a zstd compression level to the best-available matcher
766/// backend and tuning knobs. High levels map to dedicated parse modes:
767/// btopt (16-17), btultra (18), btultra2 (19-22) — matching upstream zstd
768/// `clevels.h` (level 19 is `ZSTD_btultra2`, not plain btultra).
769///
770/// Index 0 = level 1, index 21 = level 22.
771#[rustfmt::skip]
772const LEVEL_TABLE: [LevelParams; 22] = [
773    // Exactly one of fast/dfast/hc/row is Some per row, matching the strategy
774    // backend; the rest are None (not dead placeholders).
775    // Lvl  Strategy       wlog  lazy  per-strategy config
776    // ---  -------------- ----  ----  -------------------
777    /* 1 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 19, lazy_depth: 0, fast: Some(FAST_L1), dfast: None, hc: None, row: None },
778    /* 2 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 20, lazy_depth: 0, fast: Some(FAST_L2), dfast: None, hc: None, row: None },
779    /* 3 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L3), hc: None, row: None },
780    /* 4 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L4), hc: None, row: None },
781    // target_len column for L5..=L15 matches upstream zstd cParams.targetLength
782    // from clevels.h table[0] (default — srcSize > 256 KB). Upstream zstd uses
783    // it as the lazy outer loop's `sufficient_len` (nice-match) threshold.
784    // Inflating it above upstream zstd forces the chain walk to complete
785    // search_depth iterations instead of breaking on the first
786    // long-enough match — the dominant cost in the L5..=L15 speed
787    // regression vs FFI (see lazy_band_target_len_matches_default_table).
788    /* 5 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Greedy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 0, fast: None, dfast: None, hc: None, row: Some(ROW_L5) },
789    // L6-12: the upstream zstd runs the lazy/lazy2 strategies on the ROW-based
790    // match finder by default (`ZSTD_resolveRowMatchFinderMode`: row mode
791    // is on for greedy..lazy2 whenever SIMD is available) — a bounded
792    // SIMD tag scan per row instead of a pointer-chasing hash-chain walk.
793    // Our HashChain walk on these levels was ~75% of L10 wall time on the
794    // 1 MiB corpus (dependent chain-table loads). Same `RowConfig`
795    // derivation as `ROW_L5` above, upstream zstd values per level in the
796    // `ROW_L6..ROW_L12` comment block.
797    /* 6 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L6) },
798    /* 7 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L7) },
799    /* 8 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L8) },
800    /* 9 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L9) },
801    /*10 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L10) },
802    /*11 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L11) },
803    /*12 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L12) },
    // L13-15: reference uses btlazy2 (binary-tree finder) with searchLog 4/5/6
    // (search_depth 16/32/64) and targetLength 32. The rows below mirror that
    // reference search budget rather than inflating it: matching the table
    // keeps speed near the reference and makes per-level perf divergences
    // comparable.
    // NOTE(review): earlier wording here said these levels still run the
    // hash-chain Lazy parser, with the binary-tree finder (which would let a
    // smaller searchLog find longer matches and re-establish a strict ratio
    // ladder above L12) tracked separately. The rows are tagged
    // `StrategyTag::Btlazy2` / `SearchMethod::BinaryTree`, so confirm whether
    // that finder has landed. If it has not, these levels sit close to L12 on
    // hash-chain inputs by design.
812    /*13 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 16, target_len: 32, search_mls: 5 }), row: None },
813    /*14 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 22, search_depth: 32, target_len: 32, search_mls: 5 }), row: None },
814    /*15 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 23, search_depth: 64, target_len: 32, search_mls: 5 }), row: None },
815    /*16 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 32, target_len: 48, search_mls: 5 }), row: None },
816    /*17 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 32, target_len: 64, search_mls: 4 }), row: None },
817    /*18 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 64, target_len: 64, search_mls: 4 }), row: None },
818    /*19 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 24, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
819    /*20 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 25, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 25, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
820    /*21 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 26, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG), row: None },
821    /*22 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 27, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG_L22), row: None },
822];
823
824/// Upstream `ZSTD_createCDict` table geometry: the `(hash_log, chain_log)` a
825/// dictionary's prepared match-finder tables get. Thin adapter over the single
826/// cParams source [`super::cparams::create_cdict_table_logs`], which mirrors
827/// `ZSTD_adjustCParams_internal` under `ZSTD_cpm_createCDict`. `window_log` is
828/// the resolved compress window; `hash_log` / `chain_log` are the level's own
829/// widths; `uses_bt` selects the binary-tree `cycleLog` (`chainLog - 1`).
830fn cdict_table_logs(
831    window_log: u8,
832    hash_log: usize,
833    chain_log: usize,
834    uses_bt: bool,
835    dict_size: usize,
836) -> (usize, usize) {
837    let (h, c) = super::cparams::create_cdict_table_logs(
838        window_log,
839        hash_log as u32,
840        chain_log as u32,
841        uses_bt,
842        dict_size,
843    );
844    (h as usize, c as usize)
845}
846
847/// Smallest window_log the encoder will use regardless of source size.
848pub(crate) const MIN_WINDOW_LOG: u8 = 10;
849/// Conservative floor for source-size-hinted window tuning.
850///
851/// Hinted windows below 16 KiB (`window_log < 14`) currently regress C-FFI
852/// interoperability on certain compressed-block patterns. Keep hinted
853/// windows at 16 KiB or larger until that compatibility gap is closed.
854const MIN_HINTED_WINDOW_LOG: u8 = 14;
855
// NOTE: the paragraph below documents `adjust_params_for_source_size`
// (defined further down), not the tier helper that follows it. It is kept as
// a plain comment so it no longer renders as that helper's rustdoc.
//
// Adjust level parameters for a known source size.
//
// This derives a cap from `ceil(log2(src_size))`, then clamps it to
// `MIN_HINTED_WINDOW_LOG` (16 KiB). A zero-byte size hint is treated as
// `MIN_WINDOW_LOG` for the raw ceil-log step and then promoted to the hinted
// floor. This keeps tables bounded for small inputs while preserving the
// encoder's baseline minimum supported window.
// For the HC backend, `hash_log` and `chain_log` are reduced
// proportionally.
/// Source-size tier index, matching upstream `ZSTD_getCParams_internal`'s
/// `tableID = (rSize<=256K)+(rSize<=128K)+(rSize<=16K)`.
///
/// * `0` — larger than 256 KiB, or size unknown (`None`)
/// * `1` — above 128 KiB, up to and including 256 KiB
/// * `2` — above 16 KiB, up to and including 128 KiB
/// * `3` — 16 KiB or smaller
fn cparams_tier(source_size: Option<u64>) -> usize {
    const KIB: u64 = 1024;
    let Some(size) = source_size else {
        return 0;
    };
    // Same sum as upstream: one point per threshold the size fits under.
    [256 * KIB, 128 * KIB, 16 * KIB]
        .iter()
        .filter(|&&limit| size <= limit)
        .count()
}
876
877/// Override a Fast (L1/L2) or Dfast (L3) level row's table-shaping cParams
878/// (hashLog / chainLog / minMatch) by source-size tier, matching the
879/// reference `ZSTD_defaultCParameters[tableID][level]`. L1 keeps its base
880/// hashLog (the source-size window clamp in `adjust_params_for_source_size`
881/// already lands on the reference value) and only tiers minMatch; L2 also
882/// tiers hashLog (the tier-0 value 16 oversized the table on medium inputs,
883/// the page-fault pathology); L3 tiers both dfast hash widths. Strategy
884/// switches (L2 tier 1, L4) are intentionally not applied here.
885fn apply_cparams_tier(level: i32, source_size: Option<u64>, p: &mut LevelParams) {
886    let tier = cparams_tier(source_size);
887    // Single source for the table data: the verbatim upstream
888    // `ZSTD_defaultCParameters[tier][level]` row (`cparams::default_cparams`).
889    // The encoder consumes only the table-shaping widths here; the window /
890    // `table_log` clamp lives in `adjust_params_for_source_size`.
891    match level {
892        // Fast, all tiers — minMatch only (hashLog handled by the window clamp).
893        1 => {
894            if let Some(f) = p.fast.as_mut() {
895                f.mls = super::cparams::default_cparams(tier, 1).min_match;
896            }
897        }
898        // Fast (base strategy; tier 1 is dfast upstream — not switched here).
899        2 => {
900            if let Some(f) = p.fast.as_mut() {
901                let cp = super::cparams::default_cparams(tier, 2);
902                f.hash_log = cp.hash_log;
903                f.mls = cp.min_match;
904            }
905        }
906        // Dfast, all tiers — long hashLog (`hash_log`) + short chainLog (`chain_log`).
907        3 => {
908            if let Some(d) = p.dfast.as_mut() {
909                let cp = super::cparams::default_cparams(tier, 3);
910                d.long_hash_log = cp.hash_log as u8;
911                d.short_hash_log = cp.chain_log as u8;
912            }
913        }
914        _ => {}
915    }
916}
917
918fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams {
919    // Derive a source-size-based cap from ceil(log2(src_size)), then
920    // clamp first to MIN_WINDOW_LOG (baseline encoder minimum) and then to
921    // MIN_HINTED_WINDOW_LOG (16 KiB hinted floor). For tiny or zero hints we
922    // therefore keep a 16 KiB effective minimum window in hinted mode.
923    // Raw ceil(log2(src_size)) drives the internal table sizes. The
924    // advertised `window_log` is separately floored at MIN_HINTED_WINDOW_LOG
925    // (a decoder-interop requirement on the wire format), but the hash /
926    // chain table widths are internal and never appear in the frame, so they
927    // can track the actual source size below that floor.
928    let raw_src_log = source_size_ceil_log(src_size);
929    let src_log = raw_src_log.max(MIN_WINDOW_LOG).max(MIN_HINTED_WINDOW_LOG);
930    if src_log < params.window_log {
931        params.window_log = src_log;
932    }
933    // Internal match-finder tables are sized from `table_log` — the RAW
934    // source log (floored only at the baseline `MIN_WINDOW_LOG`), NOT the
935    // wire `window_log` floor. The table widths never appear in the frame, so
936    // for small inputs they can track the actual source size and avoid
937    // zeroing a window-sized table per frame; large inputs keep the level's
938    // widths. The cap is applied with the same per-backend headroom the
939    // level table uses, so the load factor (and match quality) is unchanged.
940    // The Dfast backend derives its table widths from the source in `reset`
941    // (`set_hash_bits` recomputes there), so it is not adjusted here. The Row
942    // backend's width IS capped here, mirroring the upstream zstd (see the Row branch).
943    let table_log = raw_src_log.max(MIN_WINDOW_LOG);
944    let backend = params.backend();
945    if backend == super::strategy::BackendTag::HashChain {
946        let hc = params
947            .hc
948            .as_mut()
949            .expect("HashChain level row carries an HcConfig");
950        if (table_log + 2) < hc.hash_log as u8 {
951            hc.hash_log = (table_log + 2) as usize;
952        }
953        if (table_log + 1) < hc.chain_log as u8 {
954            hc.chain_log = (table_log + 1) as usize;
955        }
956    } else if backend == super::strategy::BackendTag::Row {
957        let row = params
958            .row
959            .as_mut()
960            .expect("Row level row carries a RowConfig");
961        // Upstream zstd `ZSTD_adjustCParams_internal` (zstd_compress.c): once
962        // the window is source-capped, `hashLog <= windowLog + 1`. The row
963        // table is `2^hash_bits` slots, exactly upstream's row hashTable
964        // `2^hashLog` slots, so the same cap applies. Without it the row table
965        // stays at the level's unbounded width (e.g. L12 hash_bits 23 = 4x
966        // upstream's source-capped 21), the dominant peak-memory excess on the
967        // row band.
968        let row_cap = (table_log + 1) as usize;
969        if row_cap < row.hash_bits {
970            row.hash_bits = row_cap;
971        }
972    } else if backend == super::strategy::BackendTag::Simple {
973        let fast = params
974            .fast
975            .as_mut()
976            .expect("Fast level row carries a FastConfig");
977        let fast_cap = (table_log + 1) as u32;
978        if fast_cap < fast.hash_log {
979            fast.hash_log = fast_cap;
980        }
981    }
982    params
983}
984
985fn level22_btultra2_params_for_source_size(source_size: Option<u64>) -> LevelParams {
986    let mut hc = match source_size {
987        Some(size) if size <= 16 * 1024 => BTULTRA2_HC_CONFIG_L22_16K,
988        Some(size) if size <= 128 * 1024 => BTULTRA2_HC_CONFIG_L22_128K,
989        Some(size) if size <= 256 * 1024 => BTULTRA2_HC_CONFIG_L22_256K,
990        _ => BTULTRA2_HC_CONFIG_L22,
991    };
992    let mut window_log = match source_size {
993        Some(size) if size <= 16 * 1024 => 14,
994        Some(size) if size <= 128 * 1024 => 17,
995        Some(size) if size <= 256 * 1024 => 18,
996        _ => 27,
997    };
998    if let Some(size) = source_size
999        && size > 256 * 1024
1000    {
1001        let src_log = source_size_ceil_log(size);
1002        window_log = window_log.min(src_log.max(MIN_WINDOW_LOG));
1003        let adjusted_table_log = window_log as usize + 1;
1004        hc.hash_log = hc.hash_log.min(adjusted_table_log);
1005        hc.chain_log = hc.chain_log.min(adjusted_table_log);
1006    }
1007    LevelParams {
1008        strategy_tag: super::strategy::StrategyTag::BtUltra2,
1009        search: super::strategy::SearchMethod::BinaryTree,
1010        window_log,
1011        lazy_depth: 2,
1012        fast: None,
1013        dfast: None,
1014        hc: Some(hc),
1015        row: None,
1016    }
1017}
1018
1019/// Estimated steady-state heap footprint of a one-shot compression context
1020/// at `level` (window history + match-finder tables + block staging), in
1021/// bytes. Computed from the same per-level tuning table the encoder
1022/// resolves at frame start, so the estimate tracks the real allocations;
1023/// it is an upper-bound style budget figure, not an exact accounting.
1024pub fn estimated_compression_workspace_bytes(level: CompressionLevel) -> usize {
1025    use super::strategy::StrategyTag;
1026    let params = resolve_level_params(level, None);
1027    let window = 1usize << params.window_log;
1028    // Mirror `configure()`: the HC3 short-match side table exists only on
1029    // the btultra/btultra2 tags (minMatch 3), capped by the window log; the
1030    // BT pointer-pair layout fits inside the `4 << chain_log` chain term
1031    // (pairs over `chain_log - 1` nodes).
1032    let wants_hash3 = matches!(
1033        params.strategy_tag,
1034        StrategyTag::BtUltra | StrategyTag::BtUltra2
1035    );
1036    let uses_bt = matches!(
1037        params.strategy_tag,
1038        StrategyTag::Btlazy2 | StrategyTag::BtOpt | StrategyTag::BtUltra | StrategyTag::BtUltra2
1039    );
1040    let tables = params.fast.map(|f| 4usize << f.hash_log).unwrap_or(0)
1041        + params
1042            .dfast
1043            .map(|d| (4usize << d.long_hash_log) + (4usize << d.short_hash_log))
1044            .unwrap_or(0)
1045        + params
1046            .hc
1047            .map(|h| {
1048                let hash3 = if wants_hash3 {
1049                    4usize
1050                        << super::match_table::storage::HC3_HASH_LOG.min(params.window_log as usize)
1051                } else {
1052                    0
1053                };
1054                (4usize << h.hash_log) + (4usize << h.chain_log) + hash3
1055            })
1056            .unwrap_or(0)
1057        + params
1058            .row
1059            .map(|r| (4usize << r.hash_bits) + (2usize << r.hash_bits))
1060            .unwrap_or(0);
1061    // BT modes box a `BtMatcher`; its retained scratch layout is budgeted
1062    // next to the struct so estimator and allocator evolve together.
1063    let bt = if uses_bt {
1064        super::bt::BtMatcher::estimated_workspace_bytes()
1065    } else {
1066        0
1067    };
1068    // Block staging: literal + sequence buffers plus the compressed-block
1069    // scratch, each bounded by the 128 KiB block size.
1070    let staging = 3 * (128 * 1024);
1071    window + tables + bt + staging
1072}
1073
1074/// Extra steady-state workspace the binary-tree strategies (ordinals 6..=9,
1075/// btlazy2..btultra2) retain beyond the hash/chain tables: the boxed matcher
1076/// plus its scratch arenas, and the HC3 short-match side table for
1077/// btultra/btultra2 (capped by the window log). 0 for non-BT ordinals.
1078pub fn estimated_bt_strategy_extra_bytes(strategy_ordinal: u32, window_log: u32) -> usize {
1079    if !(6..=9).contains(&strategy_ordinal) {
1080        return 0;
1081    }
1082    let hash3 = if matches!(strategy_ordinal, 8 | 9) {
1083        4usize << super::match_table::storage::HC3_HASH_LOG.min(window_log as usize)
1084    } else {
1085        0
1086    };
1087    super::bt::BtMatcher::estimated_workspace_bytes() + hash3
1088}
1089
1090/// Resolve a [`CompressionLevel`] (+ optional source-size hint) to the
1091/// concrete [`LevelParams`] the matcher runs: strategy tag, search method
1092/// (match-finder), window log, and per-backend config.
1093///
1094/// ## CRITICAL: input size changes the match-finder (and can change strategy)
1095///
1096/// The resolved geometry is a function of the SOURCE SIZE, not the level
1097/// alone. This is the easy-to-miss part (so read this before assuming a level
1098/// maps to one fixed match-finder). It mirrors three upstream zstd stages:
1099///
/// 1. [`LEVEL_TABLE`] holds the tier-0 (source > 256 KiB) base row per level
///    (upstream `ZSTD_defaultCParameters[0]`). L5-L12 carry
///    `SearchMethod::RowHash` (the Row match-finder), like upstream's
///    greedy/lazy default.
1104/// 2. [`apply_cparams_tier`] overrides the table-shaping widths for the
1105///    smaller source tiers (upstream `ZSTD_getCParams_internal` tier table).
1106///    NOTE: upstream ALSO switches STRATEGY in some tiers (L2 → dfast, L4 →
1107///    greedy on small sources); those backend switches are NOT yet replicated,
1108///    so those levels keep their base strategy on small inputs.
1109/// 3. [`adjust_params_for_source_size`] caps `window_log` to
1110///    ~`ceil_log2(source_size)` (upstream `ZSTD_adjustCParams_internal`).
1111///
1112/// THEN, in the matcher `reset`, the greedy/lazy band falls back from
1113/// `RowHash` to `SearchMethod::HashChain` when the resolved `window_log <= 14`
1114/// — exactly upstream's `ZSTD_resolveRowMatchFinderMode` (the Row match-finder
1115/// is used for greedy/lazy/lazy2 ONLY when `windowLog > 14`). Net effect for
1116/// the SAME level:
1117///
1118/// * small input (e.g. a 10 KiB fixture → `window_log` 14) → **HashChain**
1119///   (`ZSTD_HcFindBestMatch`, scalar chain walk);
1120/// * large input (e.g. 1 MiB → `window_log` 20) → **RowHash** (the SIMD-tag
1121///   row match-finder).
1122///
1123/// A dictionary does NOT change the match-finder: it only downsizes the
1124/// prepared tables (`cdict_table_logs`, mirroring `ZSTD_createCDict`'s
1125/// small-source assumption), while `window_log` stays source-derived. So
1126/// `(L6, 10 KiB, +dict)` is HashChain and `(L6, 1 MiB, +dict)` is RowHash,
1127/// both matching upstream. When comparing against C on a fixture, resolve the
1128/// match-finder from the fixture's size first, or you may optimise/benchmark a
1129/// path C does not even take for that input.
1130fn resolve_level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
1131    if matches!(level, CompressionLevel::Level(22)) {
1132        return level22_btultra2_params_for_source_size(source_size);
1133    }
1134    let params = match level {
1135        CompressionLevel::Uncompressed => LevelParams {
1136            strategy_tag: super::strategy::StrategyTag::Fast,
1137            search: super::strategy::SearchMethod::Fast,
1138            // Uncompressed frames emit raw blocks and never reference
1139            // history; advertising a larger window only inflates
1140            // decoder-side buffer reservation. Stay at 17 (128 KiB).
1141            window_log: 17,
1142            lazy_depth: 0,
1143            // Beyond-upstream zstd: hash_log=14 (vs upstream zstd's 13) for 2× fewer
1144            // collisions on structured corpora. Upstream zstd's "base for negative"
1145            // row has targetLength=1 → step_size = 1 + 0 + 1 = 2.
1146            fast: Some(FastConfig {
1147                hash_log: 14,
1148                mls: 6,
1149                step_size: 2,
1150            }),
1151            dfast: None,
1152            hc: None,
1153            row: None,
1154        },
1155        CompressionLevel::Fastest => {
1156            // Only the Fast-specific cParams
1157            // (fast_hash_log / fast_mls / fast_step_size) align
1158            // with Uncompressed / negative-base row. window_log
1159            // stays at LEVEL_TABLE[0]'s value (19) — Fastest still
1160            // does real compression on a full window, unlike
1161            // Uncompressed which clamps to 17.
1162            let mut p = LEVEL_TABLE[0];
1163            p.fast = Some(FastConfig {
1164                hash_log: 14,
1165                mls: 6,
1166                step_size: 2,
1167            });
1168            p
1169        }
1170        CompressionLevel::Default => {
1171            // Default == Level(DEFAULT_LEVEL); tier it the same way an explicit
1172            // positive level is, so hinted default compression shrinks its
1173            // table widths on small / medium frames instead of keeping the
1174            // tier-0 row (the oversized-table page-fault pathology).
1175            let mut p = LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1];
1176            apply_cparams_tier(CompressionLevel::DEFAULT_LEVEL, source_size, &mut p);
1177            p
1178        }
1179        CompressionLevel::Better => LEVEL_TABLE[6],
1180        // Level 13: the first dominant point of the deep-lazy band. The
1181        // mls-wide row key lifted the shallow band's ratio enough that
1182        // level 11 no longer strictly beats level 7 on the ladder corpus;
1183        // the `Best` alias belongs on a config that dominates everything
1184        // below it rather than on a hair-thin margin.
1185        CompressionLevel::Best => LEVEL_TABLE[12],
1186        CompressionLevel::Level(n) => {
1187            if n > 0 {
1188                let idx = (n as usize).min(CompressionLevel::MAX_LEVEL as usize) - 1;
1189                let mut p = LEVEL_TABLE[idx];
1190                // Upstream zstd selects the cParams row from a 4-way
1191                // source-size-tiered table (`ZSTD_getCParams_internal` →
1192                // `ZSTD_defaultCParameters[tableID][level]`), and the Fast /
1193                // Dfast hashLog, chainLog and minMatch shrink for smaller
1194                // inputs. The `LEVEL_TABLE` base is the tier-0 (> 256 KiB) row;
1195                // override the table-shaping params per tier here so small and
1196                // medium frames use the reference's table widths (the oversized
1197                // tier-0 widths were a per-frame alloc / page-fault pathology on
1198                // medium inputs) and minMatch (short matches the wide hash
1199                // skips). NOTE: the reference also switches STRATEGY in some
1200                // tiers (L2 → dfast at 128..256 KiB, L4 → greedy at <= 16 KiB
1201                // and 128..256 KiB); those backend switches are not yet tiered,
1202                // so those tiers keep the base strategy.
1203                apply_cparams_tier(n, source_size, &mut p);
1204                p
1205            } else if n == 0 {
1206                // Level 0 = default, matching C zstd semantics. Tier it like the
1207                // `Default` alias so `Level(0)` and `Default` stay identical.
1208                let mut p = LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1];
1209                apply_cparams_tier(CompressionLevel::DEFAULT_LEVEL, source_size, &mut p);
1210                p
1211            } else {
1212                // Negative levels — upstream zstd sets
1213                // targetLength = -level (clampedCompressionLevel),
1214                // yielding step_size = (-level) + 1 since
1215                // !(targetLength) = 0 when targetLength > 0.
1216                // So L-1..L-7 get step_size 2..8. Acceleration
1217                // gradient comes from larger step skipping more
1218                // positions per iter (faster, worse ratio).
1219                // Clamp to upstream zstd's MIN_LEVEL before negating so
1220                // i32::MIN can't overflow on `-n`.
1221                let clamped = n.max(CompressionLevel::MIN_LEVEL);
1222                let target_length = (-clamped) as usize;
1223                let step_size = target_length + 1;
1224                // Upstream zstd row-0 ("base for negative", clevels.h srcSize>256KB):
1225                // hashLog=13, minMatch=7. The 32 KiB hash table (2^13 * 4B)
1226                // is L1d-resident on contemporary cores, so every probe is an
1227                // L1 hit; hashLog=14 (64 KiB) overflows a 32 KiB L1d and turns
1228                // each probe into an L2 access. minMatch=7 (vs 6) skips
1229                // short-distance 6-byte matches: fewer sequences, less
1230                // extension/emit work, and parity with the upstream zstd's negative
1231                // ladder on both ratio and throughput.
1232                LevelParams {
1233                    strategy_tag: super::strategy::StrategyTag::Fast,
1234                    search: super::strategy::SearchMethod::Fast,
1235                    window_log: 19,
1236                    lazy_depth: 0,
1237                    fast: Some(FastConfig {
1238                        hash_log: 13,
1239                        mls: 7,
1240                        step_size,
1241                    }),
1242                    dfast: None,
1243                    hc: None,
1244                    row: None,
1245                }
1246            }
1247        }
1248    };
1249    if let Some(size) = source_size {
1250        adjust_params_for_source_size(params, size)
1251    } else {
1252        params
1253    }
1254}
1255
1256/// The cheap fingerprint pre-splitter level for a compression level (the
1257/// C-like `blockSplitterLevel`), resolved through the same per-level
1258/// `LevelParams` table as every other tuning knob. `None` keeps the whole
1259/// 128 KiB block. The frame loop reads this instead of hardcoding the
1260/// level→split mapping at the call site.
1261pub(crate) fn level_pre_split(level: CompressionLevel) -> Option<usize> {
1262    // Resolve through `resolve_level_params` directly — NOT via the legacy
1263    // `numeric_level()` alias — so named presets read the SAME table row as
1264    // every other tuning knob (`Best` maps to its own row there, which is
1265    // not the row its numeric alias points at). `Uncompressed` (raw
1266    // blocks) never splits.
1267    if matches!(level, CompressionLevel::Uncompressed) {
1268        return None;
1269    }
1270    resolve_level_params(level, None)
1271        .pre_split()
1272        .map(usize::from)
1273}
1274
/// Backend storage for [`MatchGeneratorDriver`]. Exactly one match-finder
/// state lives in the driver at a time — the active variant. Backend
/// transitions in [`Matcher::reset`] drain the current variant's allocations
/// into the shared `vec_pool` and then replace `storage` with a freshly
/// constructed variant for the new backend.
///
/// Replaces the prior pattern of four parallel fields (`match_generator`,
/// `dfast_match_generator: Option<…>`, `row_match_generator: Option<…>`,
/// `hc_match_generator: Option<…>`) + an `active_backend: BackendTag`
/// discriminator: the parallel layout kept drained inner structures
/// allocated across backend switches, and every per-frame/per-slice
/// driver operation had to dispatch on `active_backend` to pick the
/// right field. A single enum collapses the storage and makes the
/// dispatcher pattern-match on the storage variant directly — same
/// number of arms, but `storage.backend()` is now the canonical source
/// of truth and dead variants are dropped when the active backend
/// changes.
///
/// The enum is `Clone` because a whole variant is cloned into the driver's
/// primed-dictionary snapshot (`MatchGeneratorDriver::primed`).
#[derive(Clone)]
enum MatcherStorage {
    /// Upstream zstd `ZSTD_fast` family. Constructed by
    /// [`MatchGeneratorDriver::new`] as the initial variant and
    /// re-selected by [`Matcher::reset`] for any [`CompressionLevel`]
    /// that `resolve_level_params` maps to [`StrategyTag::Fast`]
    /// (`Uncompressed`, `Fastest`, `Level(1)`, and every negative
    /// `Level(n)`; `Level(0)` is the default level, not a Fast level).
    Simple(FastKernelMatcher),
    /// Upstream zstd `ZSTD_dfast` family — two-table hash chain. Selected for
    /// any level that resolves to [`StrategyTag::Dfast`] in
    /// `resolve_level_params` (`Default`, `Level(0)`, `Level(2)`,
    /// `Level(3)`).
    Dfast(DfastMatchGenerator),
    /// Upstream zstd `ZSTD_greedy` family with row hashing. Selected for any
    /// level that resolves to [`StrategyTag::Greedy`] (currently
    /// `Level(4)` only).
    Row(RowMatchGenerator),
    /// Upstream zstd `ZSTD_lazy2` and the BT-based optimal modes
    /// (`btopt` / `btultra` / `btultra2`). Selected for any level that
    /// resolves to [`StrategyTag::Lazy`], [`StrategyTag::BtOpt`],
    /// [`StrategyTag::BtUltra`], or [`StrategyTag::BtUltra2`]
    /// (`Better`, `Best`, `Level(5..=22)`, and any `Level(n)` with
    /// `n > MAX_LEVEL` — `resolve_level_params` caps positive numeric
    /// levels at `MAX_LEVEL = 22` when it computes the table index
    /// (`(n as usize).min(MAX_LEVEL) - 1`), so `Level(23..=i32::MAX)` all
    /// land on `BtUltra2` here). The [`HcMatchGenerator`]'s internal
    /// [`HcBackend`] discriminator decides whether BT scratch is
    /// allocated.
    HashChain(HcMatchGenerator),
}
1323
1324impl MatcherStorage {
1325    /// Heap bytes the active backend variant holds (tables, history, scratch).
1326    fn heap_size(&self) -> usize {
1327        match self {
1328            Self::Simple(m) => m.heap_size(),
1329            Self::Dfast(m) => m.heap_size(),
1330            Self::Row(m) => m.heap_size(),
1331            Self::HashChain(m) => m.heap_size(),
1332        }
1333    }
1334
1335    /// [`super::strategy::BackendTag`] family of the active variant.
1336    fn backend(&self) -> super::strategy::BackendTag {
1337        use super::strategy::BackendTag;
1338        match self {
1339            Self::Simple(_) => BackendTag::Simple,
1340            Self::Dfast(_) => BackendTag::Dfast,
1341            Self::Row(_) => BackendTag::Row,
1342            Self::HashChain(_) => BackendTag::HashChain,
1343        }
1344    }
1345}
1346
/// This is the default implementation of the `Matcher` trait. It allocates and reuses the buffers when possible.
///
/// The driver owns exactly one backend match finder (`storage`) plus the
/// per-frame configuration resolved at [`Matcher::reset`]: the strategy /
/// search / parse selectors, the window and slice sizing, the source- and
/// dictionary-size hints, and the primed-dictionary snapshot.
pub struct MatchGeneratorDriver {
    // Pool of reusable byte buffers. `recycle_simple_space` pushes the Simple
    // backend's spent per-block buffer here with `len = 0` and its capacity
    // retained, so the next block does not need a fresh allocation.
    vec_pool: Vec<Vec<u8>>,
    /// Active match-finder state. Exactly one backend lives here at a
    /// time; [`Matcher::reset`] drains the previous variant into
    /// `vec_pool` before swapping in a freshly constructed variant for
    /// the new backend. `storage.backend()` is the canonical source of
    /// truth for the parse family; `strategy_tag` carries the
    /// compile-time strategy chosen at the last `reset()`.
    storage: MatcherStorage,
    // Compile-time strategy tag resolved at `reset()` from the
    // requested `CompressionLevel`'s `LevelParams`. The driver's
    // hot-block dispatcher in `blocks/compressed.rs` matches on
    // this tag to enter the corresponding `Strategy`
    // monomorphisation (`compress_block::<S>`).
    strategy_tag: super::strategy::StrategyTag,
    // Decoupled search-method axis resolved at `reset()` from
    // `LevelParams.search`. The per-block dispatcher routes on this
    // (not on `strategy_tag`) so a level's parse and search backend can
    // be chosen independently. The `BinaryTree` arm still consults
    // `strategy_tag` to pick the opt `Strategy` ZST.
    search: super::strategy::SearchMethod,
    // Decoupled parse-mode axis resolved at `reset()` from
    // `LevelParams::parse()`. Independent of `search`: greedy / lazy /
    // lazy2 can run on any non-opt search backend. The backends still
    // read their own `lazy_depth` (kept in sync at `reset()`); this is
    // the authoritative parse selector for the dispatcher.
    parse: super::strategy::ParseMode,
    /// Test-only per-level recipe override applied in `reset()` before
    /// backend selection. Lets the parse×search matrix be exercised
    /// without editing `LEVEL_TABLE`; never compiled into production.
    #[cfg(test)]
    config_override: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    /// Fine-grained per-knob overrides from the public
    /// [`super::parameters::CompressionParameters`] surface (#27).
    /// `None` (or an all-`None` [`super::parameters::ParamOverrides`])
    /// keeps the resolved level geometry byte-identical to plain
    /// level-based compression. Applied in [`Matcher::reset`] after the
    /// level params are resolved, before backend selection. Persists
    /// across resets (it is frame configuration, not a one-shot) until
    /// the caller changes it.
    param_overrides: Option<super::parameters::ParamOverrides>,
    // Block allocation size used for matcher input chunks. Starts as the
    // `slice_size` passed to `new`. NOTE(review): a small source-size hint
    // appears to shrink this value — confirm in `reset`.
    slice_size: usize,
    // The `slice_size` passed to `new`, kept separately from the (possibly
    // adjusted) `slice_size` above. Presumably the baseline that `reset`
    // starts from — TODO confirm.
    base_slice_size: usize,
    // Frame header window size must stay at the configured live-window budget.
    // Dictionary retention expands internal matcher capacity only.
    reported_window_size: usize,
    // Tracks currently retained bytes that originated from primed dictionary
    // history and have not been evicted yet.
    dictionary_retained_budget: usize,
    // Source size hint for next frame (set via set_source_size_hint, cleared on reset).
    source_size_hint: Option<u64>,
    // Dictionary content size for the next frame (set via set_dictionary_size_hint,
    // consumed on reset). When present on a binary-tree / hash-chain backend, the
    // match-finder hash/chain tables are sized from the DICTIONARY (upstream zstd CDict
    // economics: a loaded dictionary supplies the long matches, so the live tables
    // can shrink to the dict's size tier) while the eviction window stays
    // source-sized. Mirrors upstream zstd `ZSTD_getCParamRowSize`, which picks the cParams
    // table column from `dictSize` for a dictionary-bearing compress.
    dictionary_size_hint: Option<usize>,
    // Normalized `ceil_log2` bucket of the frame's source-size hint, captured at
    // `reset` (where `source_size_hint` is consumed) via [`source_size_ceil_log`].
    // `None` means the frame was unhinted. Drives `prime_with_dictionary`'s upstream zstd
    // `ZSTD_shouldAttachDict` mode for the Simple/Fast backend: `None` (unknown)
    // or `<= FAST_ATTACH_DICT_CUTOFF_LOG` → attach (separate dict table, 2-cursor
    // `compress_block_fast_dict`); larger → copy (dictionary primed into the live
    // table, 4-cursor `compress_block_fast`). The primed-snapshot key is the
    // resolved shape ([`reset_shape`](Self::reset_shape)), not this bucket.
    reset_size_log: Option<u8>,
    // Whether the loaded dictionary fits the Fast attach path's tagged position
    // field (`<= MAX_FAST_ATTACH_DICT_REGION`). Captured at `reset` from the
    // dict-size hint (which equals the actual dict length on load) so the Fast
    // attach decision, the attach-epoch reset bit, and the primed-snapshot
    // `fast_attach` bit all gate on it consistently. `true` when there is no
    // dictionary (the attach path is then unused). A dict too large to tag falls
    // back to copy mode instead of overflowing the packed position.
    reset_dict_attach_ok: bool,
    // Hint-resolved matcher shape from the last `reset`: the [`LevelParams`], the
    // active backend's applied Dfast/Row hash-table width (`0` for HC/Fast), the
    // Fast attach-vs-copy mode, and the active LDM override (#27). Combined with
    // the frame's level into the [`PrimedKey`] that keys the primed snapshot, so
    // it is only restored into a reset that resolved the identical matcher AND
    // LDM configuration. `None` before the first `reset`.
    reset_shape: Option<(
        LevelParams,
        usize,
        bool,
        Option<super::parameters::LdmOverride>,
    )>,
    // One-shot borrowed block range `[start, end)` staged by
    // `set_borrowed_block` (the borrowed frame path) for the NEXT
    // `start_matching` / `skip_matching_with_hint`. `set_borrowed_block`
    // stages on whichever backend is active, gated by `borrowed_supported`.
    // `Some` routes that call to the borrowed scan instead of the owned
    // committed-block path; consumed (reset to `None`) by the routed
    // call, and also reset to `None` by `clear_borrowed_window`. Always
    // `None` on the owned streaming path.
    borrowed_pending: Option<(usize, usize)>,
    /// CDict-equivalent: snapshot of the post-prime matcher state taken
    /// once after the first dictionary prime — the backend `storage`
    /// (hash tables + dictionary history + offset history + window) plus
    /// the driver-level `dictionary_retained_budget`, the only two pieces
    /// `prime_with_dictionary` writes. Subsequent frames restore this
    /// (a table memcpy) instead of re-hashing every dictionary position,
    /// mirroring upstream zstd `ZSTD_compressBegin_usingCDict` copying the
    /// precomputed `cdict->matchState`. Invalidated when the dictionary
    /// changes; keyed by the [`PrimedKey`] resolved matcher shape so a snapshot
    /// is only restored into a reset that produces the same matcher — see
    /// `restore_primed_dictionary`.
    primed: Option<(MatcherStorage, usize, PrimedKey)>,
}
1456
/// Identity of the matcher configuration a primed snapshot was captured under:
/// the FULLY RESOLVED matcher shape, not the raw source-size hint.
///
/// `reset()` resolves the hint into a [`LevelParams`] (window_log cap, the
/// HC/Fast table and search geometry, the parse depth/target-length that get
/// baked into the restored `storage`) plus, for the Dfast/Row backends, a
/// table-width derived from the hint's ceil-log bucket. The mapping from hint
/// to resolved shape is many-to-one: the source-size adjustment is monotone in
/// `ceil_log2(hint)`, and Level 22 additionally collapses several buckets onto
/// one upstream zstd tier (its `<= 16/128/256 KiB` thresholds). Keying on the raw hint
/// (or even its ceil-log bucket) therefore over-keys — two hints that resolve
/// to the identical matcher would each force a full re-prime. Keying on the
/// resolved (`params`, `table_bits`) pair restores across them.
///
/// `table_bits` is the hint-dependent hash-table width the ACTIVE backend
/// applied (`set_hash_bits` value for Dfast/Row; `0` for HC/Fast, whose widths
/// already live in `params`). The snapshot is only ever captured on the COPY
/// path (a hinted, above-cutoff frame), so `table_bits` is always the resolved
/// Dfast/Row value there, never the unhinted default.
///
/// `level` is kept alongside the resolved `params` because some stored matcher
/// state is derived from the level DIRECTLY, not through `params`: e.g. Dfast's
/// `use_fast_loop` is true for L3 but false for L4, yet L3 and L4 resolve to
/// byte-identical `params`. Without `level` a snapshot captured at L3 could be
/// restored into an L4 reset, installing the wrong `use_fast_loop`.
/// NOTE(review): the `MatcherStorage` variant docs list `Level(4)` under the
/// Row backend rather than Dfast, which does not square with this L3/L4
/// example — confirm which description is current.
///
/// `fast_attach` records the Fast backend's attach-vs-copy mode
/// ([`FAST_ATTACH_DICT_CUTOFF_LOG`]) because that cutoff (8 KiB) falls INSIDE a
/// single resolved shape: an 8192- and an 8193-byte Level 1 hint both clamp to
/// window_log 14 with identical `params`/`table_bits`, yet 8192 attaches (a
/// separate dict table) while 8193 copies into the live table — two different
/// `storage` shapes. The frame compressor only captures/restores snapshots on
/// the copy path today, but keying on the mode keeps the snapshot identity
/// self-sufficient rather than relying on that external gate.
///
/// Restoring a snapshot whose key differs would reinstate the old `storage`
/// (and its `max_window_size` / table dimensions / parse params / dict-table
/// shape) under a reset that resolved a different shape — the encoder could
/// then search past the frame header's window and emit an undecodable match.
/// All fields must match before a restore is allowed; the derived
/// `PartialEq` / `Eq` compare every field.
#[derive(Clone, Copy, PartialEq, Eq)]
struct PrimedKey {
    // Compression level the snapshot was captured at; kept because some
    // matcher state derives from the level directly rather than via `params`.
    level: super::CompressionLevel,
    // Fully resolved per-level parameters at capture time.
    params: LevelParams,
    // Hash-table width the active backend applied (Dfast/Row); `0` for
    // HC/Fast, whose widths already live in `params`.
    table_bits: usize,
    // Fast backend attach-vs-copy dictionary mode at capture time.
    fast_attach: bool,
    /// Fine-grained LDM override (#27) active at capture time. The
    /// snapshot's cloned `storage` carries `BtMatcher::ldm_producer`,
    /// which is configured from this override; restoring a snapshot
    /// captured under a different LDM configuration (enable flip or
    /// changed knobs) would reinstate a stale producer. `params` already
    /// pins `window_log` / `strategy_tag` (the rest of the producer's
    /// identity), so folding the override completes the LDM identity.
    /// `None` = LDM off, matching `ParamOverrides::ldm`.
    ldm: Option<super::parameters::LdmOverride>,
}
1513
1514impl MatchGeneratorDriver {
1515    /// `slice_size` sets the base block allocation size used for matcher input chunks.
1516    /// `max_slices_in_window` determines the initial window capacity at construction
1517    /// time. Effective window sizing is recalculated on every [`reset`](Self::reset)
1518    /// from the resolved compression level and optional source-size hint.
1519    pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
1520        // Validate inputs before deriving window_log_init. Three
1521        // failure modes need explicit guards:
1522        //
1523        // 1. Zero args → `max_window_size = 0` → silent 1-byte
1524        //    degenerate window (useless).
1525        // 2. Multiplication overflow on `slice_size *
1526        //    max_slices_in_window` → wraps silently in release.
1527        // 3. `next_power_of_two` overflow when the product is
1528        //    above `1 << (usize::BITS - 1)` → modern Rust PANICS
1529        //    on overflow (older Rust returned 0).
1530        //
1531        // Catch all three at construction with a clear domain-
1532        // specific message via `assert!` + `checked_mul` +
1533        // `checked_next_power_of_two`, rather than letting either
1534        // mode produce a silent degenerate matcher OR a generic
1535        // panic deep in `FastKernelMatcher::with_params`.
1536        assert!(
1537            slice_size > 0,
1538            "MatchGeneratorDriver::new requires slice_size > 0 (got 0)",
1539        );
1540        assert!(
1541            max_slices_in_window > 0,
1542            "MatchGeneratorDriver::new requires max_slices_in_window > 0 (got 0)",
1543        );
1544        let max_window_size = max_slices_in_window
1545            .checked_mul(slice_size)
1546            .expect("MatchGeneratorDriver::new: slice_size * max_slices_in_window overflows usize");
1547        // Derive an effective window_log for the initial-state matcher.
1548        // `MatchGeneratorDriver::new` runs BEFORE any reset, so it has
1549        // no LevelParams to consult — we initialise to whatever
1550        // window_log fits the caller's requested max_window_size
1551        // (round up to the next power of two via `next_power_of_two`'s
1552        // log). Reset() overwrites all three params from the resolved
1553        // LevelParams.
1554        //
1555        // `checked_next_power_of_two` returns `None` if the next power
1556        // of two would overflow `usize`. Modern Rust's
1557        // `next_power_of_two` PANICS on overflow rather than returning
1558        // 0 (the panic message is generic and unhelpful), so use the
1559        // checked variant to surface the failure with a clear,
1560        // domain-specific error.
1561        let next_pow2 = max_window_size.checked_next_power_of_two().expect(
1562            "MatchGeneratorDriver::new: max_window_size too large for \
1563             next_power_of_two without overflow",
1564        );
1565        let window_log_init = next_pow2.trailing_zeros() as u8;
1566        Self {
1567            vec_pool: Vec::new(),
1568            // Deferred table: `new` runs before any source size or resolved
1569            // LevelParams exist, so allocating at the level-default hash_log
1570            // here would be thrown away by the first frame's reset (which
1571            // clamps the window to the input and reallocs at the resolved
1572            // size). The deferral lets that first reset allocate exactly once.
1573            storage: MatcherStorage::Simple(FastKernelMatcher::with_params_deferred(
1574                window_log_init,
1575                FAST_LEVEL_1_HASH_LOG,
1576                FAST_LEVEL_1_MLS,
1577                2, // upstream zstd default step_size (targetLength=0 → step=2)
1578            )),
1579            strategy_tag: super::strategy::StrategyTag::Fast,
1580            search: super::strategy::SearchMethod::Fast,
1581            parse: super::strategy::ParseMode::Greedy,
1582            #[cfg(test)]
1583            config_override: None,
1584            param_overrides: None,
1585            slice_size,
1586            base_slice_size: slice_size,
1587            // Report the ROUNDED-UP window size that the matcher
1588            // actually carries (via `window_log_init = log2(next_pow2)`
1589            // → matcher's `max_window_size = 1 << window_log_init =
1590            // next_pow2`). For non-power-of-two `slice_size *
1591            // max_slices_in_window` inputs, the unrounded value
1592            // would under-report the active backend's window until
1593            // the first `reset()` overwrites both sides from the
1594            // resolved LevelParams.
1595            reported_window_size: next_pow2,
1596            reset_size_log: None,
1597            reset_dict_attach_ok: true,
1598            reset_shape: None,
1599            dictionary_retained_budget: 0,
1600            source_size_hint: None,
1601            dictionary_size_hint: None,
1602            borrowed_pending: None,
1603            primed: None,
1604        }
1605    }
1606
    /// Resolves `level` and the optional source-size hint into a
    /// [`LevelParams`] by forwarding both arguments unchanged to
    /// `resolve_level_params`.
    fn level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
        resolve_level_params(level, source_size)
    }
1610
    /// Install the public-parameter per-knob overrides (#27) applied at
    /// the next [`Matcher::reset`]. `None` (or an all-`None` set) restores
    /// plain level-based geometry. Persists across resets until changed.
    ///
    /// This only stores the value in `param_overrides`; nothing in the
    /// active matcher is touched here.
    pub(crate) fn set_param_overrides(
        &mut self,
        overrides: Option<super::parameters::ParamOverrides>,
    ) {
        self.param_overrides = overrides;
    }
1620
    /// Active backend family derived from the storage variant. Single
    /// source of truth — no separate runtime tag to drift against.
    ///
    /// Returns whatever `MatcherStorage::backend` reports for the variant
    /// currently held in `storage`.
    pub(crate) fn active_backend(&self) -> super::strategy::BackendTag {
        self.storage.backend()
    }
1626
1627    /// Whether the borrowed (no-copy, in-place over-window) scan is
1628    /// implemented for the current backend + search configuration. The
1629    /// HashChain backend serves both the lazy CHAIN parser
1630    /// (`SearchMethod::HashChain`) and the BT/optimal parsers
1631    /// (`SearchMethod::BinaryTree`); only the lazy chain has a borrowed scan
1632    /// so far, so BT/optimal stay on the owned path.
1633    pub(crate) fn borrowed_supported(&self) -> bool {
1634        use super::strategy::{BackendTag, SearchMethod, StrategyTag};
1635        match self.active_backend() {
1636            BackendTag::Simple | BackendTag::Dfast | BackendTag::Row => true,
1637            // The HashChain backend covers two searches: the lazy CHAIN parser
1638            // (borrowed-capable) and the BINARY-TREE search (btlazy2 L13-15 +
1639            // optimal BtOpt/BtUltra/BtUltra2 L16-22). btlazy2's BT-tree borrowed
1640            // scan is byte-identical to owned (reads via live_history()), so it
1641            // takes the in-place path. The OPTIMAL parsers stay owned: their
1642            // cost-based DP is sensitive to candidate quality, and the borrowed
1643            // continuous-index scan yields slightly different (ratio-worse)
1644            // candidates than the owned evict+rehash scan — borrowed optimal
1645            // both diverged from owned and fell outside the ffi ratio bound.
1646            // Search-aware (not just strategy_tag) so optimal BT can never be
1647            // staged on the borrowed path even via an internal caller.
1648            BackendTag::HashChain => match self.search {
1649                SearchMethod::HashChain => true,
1650                SearchMethod::BinaryTree => matches!(self.strategy_tag, StrategyTag::Btlazy2),
1651                _ => false,
1652            },
1653        }
1654    }
1655
1656    /// Whether a DICTIONARY frame can take the borrowed (no input copy) path.
1657    /// Only the Simple (Fast) backend with the dictionary ATTACHED (not the
1658    /// copy/merge regime) has a borrowed dict scan — `start_matching_borrowed_dict`
1659    /// reads live matches from the borrowed input in place and dict matches
1660    /// from the committed dict prefix via the 2-segment counter. Every other
1661    /// backend, and copy-mode (large-input) dict frames, stay on the owned
1662    /// path. Checked AFTER priming, so `is_attached()` reflects the resolved
1663    /// attach-vs-copy decision.
1664    pub(crate) fn borrowed_dict_supported(&self) -> bool {
1665        matches!(
1666            &self.storage,
1667            MatcherStorage::Simple(m) if m.dict_is_attached()
1668        )
1669    }
1670
1671    fn simple_mut(&mut self) -> &mut FastKernelMatcher {
1672        match &mut self.storage {
1673            MatcherStorage::Simple(m) => m,
1674            _ => panic!("simple backend must be initialized by reset() before use"),
1675        }
1676    }
1677
1678    /// Reclaim the per-block input buffer that the Simple backend
1679    /// just spent inside `start_matching` / `skip_matching_with_hint`.
1680    ///
1681    /// `FastKernelMatcher::take_recycled_space` returns the cleared
1682    /// (capacity-retained) `Vec<u8>` from the last
1683    /// `extend_history_with_pending`. We push it onto `vec_pool`
1684    /// as-is (with `len = 0`); `get_next_space()` is responsible for
1685    /// resizing the buffer back to `slice_size` on its next pop. The
1686    /// pushed length is irrelevant — only the capacity matters, and
1687    /// `extend_history_with_pending` preserves it. Without this
1688    /// recycle path, the Simple backend would allocate a new
1689    /// `Vec<u8>` per block — a measurable hot-path cost when blocks
1690    /// are small (~128 KiB) and processed at hundreds of MiB/s.
1691    fn recycle_simple_space(&mut self) {
1692        if let Some(space) = self.simple_mut().take_recycled_space() {
1693            // `space` is already cleared (len = 0) by
1694            // `extend_history_with_pending`; capacity is retained.
1695            // Leaving `len = 0` here avoids the cost of zero-filling
1696            // the entire allocation — `get_next_space()` resizes the
1697            // popped buffer up to `slice_size` on demand, so the
1698            // length the pool holds is irrelevant. This matters most
1699            // after a small-source-size hint has shrunk `slice_size`
1700            // mid-frame: the recycled buffer can be much larger than
1701            // the current `slice_size`, and zero-filling 128 KiB+ on
1702            // every block would erase the perf win the recycle path
1703            // is meant to deliver.
1704            self.vec_pool.push(space);
1705        }
1706    }
1707
1708    /// Register a caller-owned input buffer as the Simple backend's
1709    /// borrowed one-shot match window. Only valid on the Simple (Fast)
1710    /// backend; the one-shot frame path gates on that before calling.
1711    ///
1712    /// # Safety
1713    /// Same contract as [`FastKernelMatcher::set_borrowed_window`]: the
1714    /// buffer must stay live and unmodified until the window is cleared,
1715    /// and must be cleared before the buffer is dropped or the matcher is
1716    /// reused for another frame.
1717    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
1718        // SAFETY: forwarded contract — caller upholds liveness/clear.
1719        match self.active_backend() {
1720            super::strategy::BackendTag::Simple => unsafe {
1721                self.simple_mut().set_borrowed_window(buffer)
1722            },
1723            super::strategy::BackendTag::Dfast => unsafe {
1724                self.dfast_matcher_mut().set_borrowed_window(buffer)
1725            },
1726            super::strategy::BackendTag::Row => unsafe {
1727                self.row_matcher_mut().set_borrowed_window(buffer)
1728            },
1729            super::strategy::BackendTag::HashChain => unsafe {
1730                self.hc_matcher_mut().set_borrowed_window(buffer)
1731            },
1732        }
1733    }
1734
1735    /// Clear the borrowed one-shot window, returning the active backend
1736    /// to the owned `history` path.
1737    pub(crate) fn clear_borrowed_window(&mut self) {
1738        match self.active_backend() {
1739            super::strategy::BackendTag::Simple => self.simple_mut().clear_borrowed_window(),
1740            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().clear_borrowed_window(),
1741            super::strategy::BackendTag::Row => self.row_matcher_mut().clear_borrowed_window(),
1742            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().clear_borrowed_window(),
1743            #[allow(unreachable_patterns)]
1744            _ => {}
1745        }
1746        self.borrowed_pending = None;
1747    }
1748
1749    /// Stage the borrowed block range `[block_start, block_end)` for the
1750    /// NEXT `start_matching` / `skip_matching_with_hint`, which the
1751    /// borrowed Fast frame path uses in place of `commit_space`. While
1752    /// staged, those trait calls route to the Simple backend's borrowed
1753    /// scan/skip (consuming the stage) instead of the owned committed
1754    /// block. See [`Matcher::start_matching`] /
1755    /// [`Matcher::skip_matching_with_hint`] on this type.
1756    pub(crate) fn set_borrowed_block(&mut self, block_start: usize, block_end: usize) {
1757        assert!(
1758            self.borrowed_supported(),
1759            "borrowed block staging is not supported for the active backend/search config",
1760        );
1761        assert!(
1762            block_start <= block_end,
1763            "borrowed block range must satisfy start <= end (start={block_start} end={block_end})",
1764        );
1765        self.borrowed_pending = Some((block_start, block_end));
1766        // Make the range visible to `get_last_space()` immediately: the
1767        // emit pipeline reads `get_last_space().len()` in
1768        // `collect_block_parts` BEFORE `start_matching` consumes the
1769        // stage, so the staged block (not the whole borrowed window) must
1770        // be reported now to keep the literal-buffer reservation right.
1771        match self.active_backend() {
1772            super::strategy::BackendTag::Simple => self
1773                .simple_mut()
1774                .stage_borrowed_block(block_start, block_end),
1775            super::strategy::BackendTag::Dfast => self
1776                .dfast_matcher_mut()
1777                .stage_borrowed_block(block_start, block_end),
1778            super::strategy::BackendTag::Row => self
1779                .row_matcher_mut()
1780                .stage_borrowed_block(block_start, block_end),
1781            super::strategy::BackendTag::HashChain => self
1782                .hc_matcher_mut()
1783                .table
1784                .stage_borrowed_block(block_start, block_end),
1785        }
1786    }
1787
1788    #[cfg(test)]
1789    fn dfast_matcher(&self) -> &DfastMatchGenerator {
1790        match &self.storage {
1791            MatcherStorage::Dfast(m) => m,
1792            _ => panic!("dfast backend must be initialized by reset() before use"),
1793        }
1794    }
1795
1796    fn dfast_matcher_mut(&mut self) -> &mut DfastMatchGenerator {
1797        match &mut self.storage {
1798            MatcherStorage::Dfast(m) => m,
1799            _ => panic!("dfast backend must be initialized by reset() before use"),
1800        }
1801    }
1802
1803    #[cfg(test)]
1804    fn row_matcher(&self) -> &RowMatchGenerator {
1805        match &self.storage {
1806            MatcherStorage::Row(m) => m,
1807            _ => panic!("row backend must be initialized by reset() before use"),
1808        }
1809    }
1810
1811    fn row_matcher_mut(&mut self) -> &mut RowMatchGenerator {
1812        match &mut self.storage {
1813            MatcherStorage::Row(m) => m,
1814            _ => panic!("row backend must be initialized by reset() before use"),
1815        }
1816    }
1817
1818    #[cfg(test)]
1819    fn hc_matcher(&self) -> &HcMatchGenerator {
1820        match &self.storage {
1821            MatcherStorage::HashChain(m) => m,
1822            _ => panic!("hash chain backend must be initialized by reset() before use"),
1823        }
1824    }
1825
1826    fn hc_matcher_mut(&mut self) -> &mut HcMatchGenerator {
1827        match &mut self.storage {
1828            MatcherStorage::HashChain(m) => m,
1829            _ => panic!("hash chain backend must be initialized by reset() before use"),
1830        }
1831    }
1832
1833    /// Shrink the active backend's `max_window_size` by the bytes
1834    /// reclaimed from the dictionary-retention budget. Returns `true`
1835    /// iff any reclamation happened — the caller uses that as the
1836    /// gate for [`Self::trim_after_budget_retire`] (which is a no-op
1837    /// otherwise: with `max_window_size` unchanged the backend's
1838    /// `trim_to_window` cannot find anything to evict, so calling it
1839    /// just runs an extra `match` ladder + a single early-out check
1840    /// per slice commit).
1841    #[must_use]
1842    fn retire_dictionary_budget(&mut self, evicted_bytes: usize) -> bool {
1843        let reclaimed = evicted_bytes.min(self.dictionary_retained_budget);
1844        if reclaimed == 0 {
1845            return false;
1846        }
1847        self.dictionary_retained_budget -= reclaimed;
1848        match self.active_backend() {
1849            super::strategy::BackendTag::Simple => {
1850                let matcher = self.simple_mut();
1851                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1852                // retained dict budget is tracked independently and the
1853                // window may already have been shrunk by a prior eviction,
1854                // so the floor at 0 is the correct clamp, not a masked bug.
1855                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1856            }
1857            super::strategy::BackendTag::Dfast => {
1858                let matcher = self.dfast_matcher_mut();
1859                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1860                // retained dict budget is tracked independently and the
1861                // window may already have been shrunk by a prior eviction,
1862                // so the floor at 0 is the correct clamp, not a masked bug.
1863                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1864            }
1865            super::strategy::BackendTag::Row => {
1866                let matcher = self.row_matcher_mut();
1867                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1868                // retained dict budget is tracked independently and the
1869                // window may already have been shrunk by a prior eviction,
1870                // so the floor at 0 is the correct clamp, not a masked bug.
1871                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1872            }
1873            super::strategy::BackendTag::HashChain => {
1874                let matcher = self.hc_matcher_mut();
1875                // See the Simple arm: `reclaimed` may exceed the current
1876                // window, so saturating to 0 is the correct clamp.
1877                matcher.table.max_window_size =
1878                    matcher.table.max_window_size.saturating_sub(reclaimed);
1879            }
1880        }
1881        true
1882    }
1883
1884    fn trim_after_budget_retire(&mut self) {
1885        loop {
1886            let mut evicted_bytes = 0usize;
1887            match self.active_backend() {
1888                super::strategy::BackendTag::Simple => {
1889                    // FastKernelMatcher owns its history as a single
1890                    // flat `Vec<u8>` (upstream zstd's flat-buffer layout)
1891                    // rather than the legacy per-block `WindowEntry`
1892                    // stack. There are no per-block Vec allocations
1893                    // to recycle into `vec_pool` — `trim_to_window`
1894                    // drains the oldest bytes in-place and returns
1895                    // the count for the dictionary-budget loop's
1896                    // termination check.
1897                    let MatcherStorage::Simple(m) = &mut self.storage else {
1898                        unreachable!("active_backend() == Simple proven above");
1899                    };
1900                    evicted_bytes += m.trim_to_window();
1901                }
1902                super::strategy::BackendTag::Dfast => {
1903                    // Dfast doesn't retain input Vecs — `history` is the
1904                    // only byte store, so there is no per-block buffer
1905                    // to push back through a callback. Eviction byte
1906                    // count is derived from the `window_size` delta
1907                    // before/after; the Dfast variant of
1908                    // `trim_to_window` takes no closure, sidestepping
1909                    // an unused-`impl FnMut` monomorphization that
1910                    // would otherwise contractually never fire.
1911                    let dfast = self.dfast_matcher_mut();
1912                    let pre = dfast.window_size;
1913                    dfast.trim_to_window();
1914                    evicted_bytes += pre - dfast.window_size;
1915                }
1916                super::strategy::BackendTag::Row => {
1917                    // Row keeps bytes only in the contiguous `history` mirror
1918                    // (block buffers are returned to the pool per block in
1919                    // `add_data`), so derive the eviction count from the
1920                    // `window_size` delta, mirroring the Dfast / HashChain arms.
1921                    let row = self.row_matcher_mut();
1922                    let pre = row.window_size;
1923                    row.trim_to_window();
1924                    evicted_bytes += pre - row.window_size;
1925                }
1926                super::strategy::BackendTag::HashChain => {
1927                    // HC keeps bytes only in the contiguous `history` mirror
1928                    // (no per-block Vecs to recycle since the window<->history
1929                    // dedup), so derive the eviction count from the
1930                    // `window_size` delta, mirroring the Dfast arm above.
1931                    let table = &mut self.hc_matcher_mut().table;
1932                    let pre = table.window_size;
1933                    table.trim_to_window();
1934                    evicted_bytes += pre - table.window_size;
1935                }
1936            }
1937            if evicted_bytes == 0 {
1938                break;
1939            }
1940            // The loop's invariant is "the backend's previous
1941            // `max_window_size` shrink had downstream bytes left to
1942            // evict" — that's what `evicted_bytes != 0` proves at
1943            // this point. `dictionary_retained_budget` is NOT
1944            // guaranteed to be positive here: the outer
1945            // `retire_dictionary_budget` call may have already
1946            // drained it to zero by reclaiming the last retained
1947            // bytes, while the backend still has bytes above the
1948            // freshly-shrunk window cap waiting for this loop to
1949            // evict. The return value of the retire call below is
1950            // therefore intentionally discarded — the loop's
1951            // termination is driven by `evicted_bytes == 0`, not by
1952            // whether the budget has more bytes left to reclaim.
1953            let _ = self.retire_dictionary_budget(evicted_bytes);
1954        }
1955    }
1956
1957    /// ATTACH (`true`) vs COPY (`false`) decision for the dms-bearing HashChain
1958    /// backend (lazy hash-chain AND binary-tree/optimal levels), mirroring
1959    /// upstream `ZSTD_shouldAttachDict` and its per-strategy `attachDictSizeCutoffs`:
1960    /// a small / unknown source ATTACHES the dict as a separate dms (hash-chain
1961    /// dms for lazy, DUBT dms for BT); a large known source COPIES it into the
1962    /// live chain / tree. The cutoff is the lazy/lazy2 value for HC, the
1963    /// btlazy2/btopt value for Bt{Opt}, and the smaller btultra/btultra2 value for
1964    /// the deepest parses. Both `skip_matching_for_dictionary_priming` (which
1965    /// stages the dict) and `prime_with_dictionary` (which builds-or-drops the
1966    /// dms) read this so the two stay in lock-step.
1967    fn hc_dict_attach_mode(&self) -> bool {
1968        // Only the HashChain backend (lazy hash-chain + BT/optimal) routes here;
1969        // a non-HashChain storage has no dms decision, so default to attach.
1970        let MatcherStorage::HashChain(hc) = &self.storage else {
1971            return true;
1972        };
1973        let cutoff = if hc.table.uses_bt {
1974            match hc.strategy_tag {
1975                super::strategy::StrategyTag::BtUltra | super::strategy::StrategyTag::BtUltra2 => {
1976                    BT_ULTRA_ATTACH_DICT_CUTOFF_LOG
1977                }
1978                _ => BT_OPT_ATTACH_DICT_CUTOFF_LOG,
1979            }
1980        } else {
1981            HC_ATTACH_DICT_CUTOFF_LOG
1982        };
1983        self.reset_size_log.is_none_or(|log| log <= cutoff)
1984    }
1985
1986    fn skip_matching_for_dictionary_priming(&mut self) {
1987        match self.active_backend() {
1988            super::strategy::BackendTag::Simple => {
1989                // Upstream zstd `ZSTD_shouldAttachDict` mode selection for the Fast
1990                // strategy (cutoff 8 KB): small / unknown-size inputs ATTACH
1991                // (index dict positions into a SEPARATE immutable table; the
1992                // dual-probe 2-cursor `compress_block_fast_dict` then prefers
1993                // recent-input matches and falls back to the dict — the path
1994                // that wins small/unknown). Large known-size inputs COPY (prime
1995                // dict into the live table; the 4-cursor `compress_block_fast`
1996                // matches against it as window history — the path that already
1997                // matches/beats the upstream zstd on large corpora). The dispatch in
1998                // `start_matching` keys off `dict_table.is_some()`, which only
1999                // the attach path populates. See [`FAST_ATTACH_DICT_CUTOFF_LOG`].
2000                let attach = self.reset_dict_attach_ok
2001                    && self
2002                        .reset_size_log
2003                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2004                if attach {
2005                    self.simple_mut().skip_matching_for_dict_prime();
2006                } else {
2007                    self.simple_mut().skip_matching_with_hint(Some(false));
2008                }
2009                self.recycle_simple_space();
2010            }
2011            super::strategy::BackendTag::Dfast => {
2012                // Upstream zstd `ZSTD_dictMatchState` mode selection for dfast (cutoff
2013                // 16 KiB): small / unknown-size inputs ATTACH (build the
2014                // separate immutable dict long+short tables; the dual-probe
2015                // `start_matching_fast_loop` searches live + dict, the path that
2016                // avoids the per-frame dict re-prime that dominates small
2017                // `compress-dict`). Larger known-size inputs COPY (re-prime the
2018                // dict into the live tables via `skip_matching_dense`, where the
2019                // dense scan matches it as window history). `skip_matching_for_dict_attach`
2020                // self-gates on `use_fast_loop` (only fast-loop levels carry the
2021                // dual-probe; general-path levels fall back to the dense copy).
2022                let attach = self
2023                    .reset_size_log
2024                    .is_none_or(|log| log <= DFAST_ATTACH_DICT_CUTOFF_LOG);
2025                if attach {
2026                    self.dfast_matcher_mut().skip_matching_for_dict_attach();
2027                } else {
2028                    self.dfast_matcher_mut().invalidate_dict_cache();
2029                    self.dfast_matcher_mut().skip_matching_dense();
2030                }
2031            }
2032            super::strategy::BackendTag::Row => {
2033                // Upstream zstd `ZSTD_RowFindBestMatch` `dictMatchState`: small /
2034                // unknown-size inputs ATTACH (build the separate immutable dict
2035                // row index; the bounded dual-probe in `row_candidate_rl`
2036                // searches live + dict, avoiding the per-frame dict re-index),
2037                // larger known-size inputs COPY (dense re-prime into the live
2038                // rows).
2039                let attach = self
2040                    .reset_size_log
2041                    .is_none_or(|log| log <= ROW_ATTACH_DICT_CUTOFF_LOG);
2042                if attach {
2043                    self.row_matcher_mut().prime_dict_attach_current_block();
2044                } else {
2045                    self.row_matcher_mut().invalidate_dict_cache();
2046                    self.row_matcher_mut().skip_matching_with_hint(Some(false));
2047                }
2048            }
2049            super::strategy::BackendTag::HashChain => {
2050                // Lazy-HC AND BT/optimal both follow upstream zstd `ZSTD_shouldAttachDict`
2051                // per-strategy: ATTACH (a separate dms — hash-chain dms for lazy,
2052                // DUBT dms for BT) for small / unknown inputs, COPY (merge the dict
2053                // into the live chain/tree) for large known inputs. ATTACH keeps
2054                // the dict in history but out of the live structure via
2055                // `skip_matching_dict_bt` (the cursor advance is shared by both
2056                // arms); COPY routes through the normal `skip_matching` (its
2057                // `uses_bt` branch fills the live tree, the lazy branch the live
2058                // chain). The dms is built-or-dropped to match in
2059                // `prime_with_dictionary`.
2060                if self.hc_dict_attach_mode() {
2061                    self.hc_matcher_mut().table.skip_matching_dict_bt();
2062                } else {
2063                    self.hc_matcher_mut().skip_matching(Some(false));
2064                }
2065            }
2066        }
2067    }
2068}
2069
2070impl Matcher for MatchGeneratorDriver {
    /// Always `true`: this driver has a dictionary-priming path for every
    /// backend it can host (see `skip_matching_for_dictionary_priming`).
    fn supports_dictionary_priming(&self) -> bool {
        true
    }
2074
    /// Record the expected source size for the upcoming frame.
    ///
    /// The hint is one-shot: the next `reset()` consumes it with `take()`,
    /// so it must be set again before each reset that should be hinted.
    fn set_source_size_hint(&mut self, size: u64) {
        self.source_size_hint = Some(size);
    }
2078
    /// Record the dictionary size for the upcoming frame.
    ///
    /// Like the source-size hint this is one-shot: the next `reset()`
    /// consumes it with `take()` and uses it for the attach/copy decision
    /// and for dictionary-driven table sizing.
    fn set_dictionary_size_hint(&mut self, size: usize) {
        self.dictionary_size_hint = Some(size);
    }
2082
2083    /// Dict-relevance gate for the raw-fast-path. Reached only when a dictionary
2084    /// is active (the caller short-circuits on `dict_active`), so this answers
2085    /// "could the dict compress this otherwise-incompressible-looking block?".
2086    /// The Simple (Fast) backend samples its dict table precisely
2087    /// ([`FastKernelMatcher::block_samples_match_dict`]); the other backends
2088    /// (Dfast / Row / HashChain / BT) have their own dict structures and no cheap
2089    /// probe here, so they answer CONSERVATIVELY `true`: without a probe they
2090    /// cannot tell whether the dict compresses an incompressible-LOOKING block,
2091    /// and answering `false` would let the raw-fast-path emit such a block raw
2092    /// and miss an embedded dict segment. `dictionary_segment_in_incompressible_input_is_matched`
2093    /// pins this for Dfast/Row/BT — the 512-byte dict run inside high-entropy
2094    /// filler is matched only because these backends stay on the scan. So they
2095    /// keep the blanket scan the old `!dict_active` gate gave them; only the
2096    /// Simple/Fast backend trades it for the precise probe.
2097    fn block_samples_match_dict(&self, block: &[u8]) -> bool {
2098        match &self.storage {
2099            MatcherStorage::Simple(m) => m.block_samples_match_dict(block),
2100            _ => true,
2101        }
2102    }
2103
2104    /// Heap bytes this driver owns: the active backend's tables/history, the
2105    /// recycled input-buffer pool, and the primed-dictionary snapshot (a cloned
2106    /// backend kept for CDict-equivalent reuse). The inline struct itself is
2107    /// accounted by the owner's `size_of`.
2108    fn heap_size(&self) -> usize {
2109        let pool: usize = self.vec_pool.capacity() * core::mem::size_of::<Vec<u8>>()
2110            + self.vec_pool.iter().map(Vec::capacity).sum::<usize>();
2111        let snapshot = self
2112            .primed
2113            .as_ref()
2114            .map_or(0, |(storage, _, _)| storage.heap_size());
2115        pool + self.storage.heap_size() + snapshot
2116    }
2117
    /// Drop any public-parameter overrides, so that later `reset()` calls
    /// use the level-resolved parameters unchanged.
    fn clear_param_overrides(&mut self) {
        self.param_overrides = None;
    }
2121
2122    fn reset(&mut self, level: CompressionLevel) {
2123        let hint = self.source_size_hint.take();
2124        let dict_hint = self.dictionary_size_hint.take();
2125        // Snapshot the hint's normalized ceil-log bucket for the primed-snapshot
2126        // key and prime_with_dictionary's attach/copy mode decision (the hint is
2127        // consumed here, but priming happens just after reset). Storing the
2128        // bucket rather than the raw bytes means two hints that resolve to the
2129        // same matcher shape share one snapshot instead of each re-priming.
2130        self.reset_size_log = hint.map(source_size_ceil_log);
2131        // A dictionary too large for the tagged attach position field falls back
2132        // to copy mode. Captured here (from the load-set size hint = actual dict
2133        // length) so the prime decision and the snapshot-key / epoch bits agree.
2134        self.reset_dict_attach_ok =
2135            dict_hint.is_none_or(|size| size <= MAX_FAST_ATTACH_DICT_REGION);
2136        let hinted = hint.is_some();
2137        #[cfg_attr(not(test), allow(unused_mut))]
2138        let mut params = Self::level_params(level, hint);
2139        // Test-only: apply a parse×search override so the matrix can be
2140        // exercised without editing `LEVEL_TABLE`. Mutating `params` here
2141        // (before `next_backend`) flows the override through storage
2142        // selection, `configure`, and the `self.search`/`self.parse`
2143        // writes uniformly. Consumed with `take()` so it is one-shot: the
2144        // synthetic pairing applies to exactly this `reset()`, and a later
2145        // reset on the same driver falls back to the level's real config.
2146        #[cfg(test)]
2147        if let Some((search, parse)) = self.config_override.take() {
2148            params.search = search;
2149            params.lazy_depth = parse.lazy_depth();
2150            // The matrix sweep can pair a level with a backend its native
2151            // row doesn't populate (e.g. greedy L5, which carries only `row`,
2152            // run on HashChain). Synthesize a default config for the
2153            // overridden backend so its `configure` arm has something to read.
2154            use super::strategy::SearchMethod;
2155            match search {
2156                SearchMethod::Fast => {
2157                    params.fast.get_or_insert(FAST_L1);
2158                }
2159                SearchMethod::DoubleFast => {
2160                    params.dfast.get_or_insert(DFAST_L3);
2161                }
2162                SearchMethod::RowHash => {
2163                    params.row.get_or_insert(ROW_CONFIG);
2164                }
2165                SearchMethod::HashChain | SearchMethod::BinaryTree => {
2166                    params.hc.get_or_insert(HC_CONFIG);
2167                }
2168            }
2169        }
2170        // Public-parameter overrides (#27): apply the per-knob set on top
2171        // of the level-resolved params. A strategy override re-routes the
2172        // backend, so this must precede `next_backend` selection. The
2173        // all-`None` case is skipped so default level geometry stays
2174        // byte-identical to plain level-based compression.
2175        if let Some(ov) = self.param_overrides
2176            && !ov.is_empty()
2177        {
2178            apply_param_overrides(&mut params, &ov);
2179            // `Self::level_params(level, hint)` applied the source-size cap
2180            // for the LEVEL's native backend. If a strategy override moved
2181            // the frame onto a different backend, `apply_param_overrides`
2182            // synthesized that backend's DEFAULT config (FAST_L1 /
2183            // HC_OVERRIDE_DEFAULT) with full-size table logs AFTER that cap
2184            // ran. Re-apply the hint cap so a tiny hinted frame doesn't
2185            // allocate the new backend's full-size tables. An explicit
2186            // `window_log` override is the user's hard request and must
2187            // survive the re-cap, so restore it afterwards.
2188            if let Some(hint_size) = hint {
2189                params = adjust_params_for_source_size(params, hint_size);
2190                if let Some(window_log) = ov.window_log {
2191                    params.window_log = window_log;
2192                }
2193            }
2194        }
2195        // Dictionary-driven table sizing — parity with upstream zstd `ZSTD_createCDict`
2196        // (`ZSTD_getCParams_internal(level, UNKNOWN, dictSize, ZSTD_cpm_createCDict)`
2197        // → `ZSTD_adjustCParams_internal`). A loaded dictionary supplies the
2198        // long-distance matches, so upstream zstd sizes the prepared match-finder tables
2199        // to the DICTIONARY (assuming a `minSrcSize` source), not the live
2200        // window: it downsizes `hashLog`/`chainLog` toward the dict-and-window
2201        // log while leaving the frame's eviction `window_log` source-derived so
2202        // the dictionary bytes stay referenceable (`ZSTD_resetCCtx_byCopyingCDict`
2203        // copies the small CDict tables but keeps the source window). We apply
2204        // the same downsizing to the level's own hc geometry and cap (min) so a
2205        // dict never inflates the level tables. Only the binary-tree / hash-chain
2206        // backend reads `hc.{hash,chain}_log`; Simple/Dfast/Row derive their
2207        // widths from the source window in their `reset` arms.
2208        // A zero-length dictionary is "no dictionary": running the CDict sizing
2209        // path for `Some(0)` is not a no-op — `cdict_table_logs(.., 0)` still
2210        // collapses the HC/BT tables toward the 513-byte upstream zstd tier via
2211        // `DICT_MIN_SRC_SIZE`, tanking ratio/perf on the next frame. Priming
2212        // already treats empty content as empty, so skip the downsizing here too.
2213        if let Some(dict_size) = dict_hint.filter(|&size| size > 0) {
2214            // Derive the dict-tier geometry from the level's FULL (un-source-capped)
2215            // hc widths. `Self::level_params(level, hint)` already source-capped
2216            // `params.hc`; feeding those capped widths into `cdict_table_logs` and
2217            // then `.min()`-ing would double-cap, so on a small hinted source with a
2218            // large dictionary the prepared tables collapse below what the dict needs
2219            // — defeating the `ZSTD_createCDict` geometry this mirrors. Take the
2220            // un-hinted base widths instead and assign the result directly:
2221            // `cdict_table_logs` only ever downsizes, so it never exceeds the base
2222            // level geometry, while the eviction `window_log` stays source-derived so
2223            // the dictionary bytes remain referenceable. Active public-parameter
2224            // overrides (#27) are applied to the base too, so a strategy override
2225            // that routes onto HashChain/BinaryTree still gets dict-tier sizing and
2226            // explicit hash/chain overrides feed through as the geometry ceiling.
2227            let mut base_params = Self::level_params(level, None);
2228            if let Some(ov) = self.param_overrides
2229                && !ov.is_empty()
2230            {
2231                apply_param_overrides(&mut base_params, &ov);
2232            }
2233            if let (Some(hc), Some(base_hc)) = (params.hc.as_mut(), base_params.hc) {
2234                let uses_bt = matches!(
2235                    params.strategy_tag,
2236                    super::strategy::StrategyTag::Btlazy2
2237                        | super::strategy::StrategyTag::BtOpt
2238                        | super::strategy::StrategyTag::BtUltra
2239                        | super::strategy::StrategyTag::BtUltra2
2240                );
2241                let (dict_hash_log, dict_chain_log) = cdict_table_logs(
2242                    params.window_log,
2243                    base_hc.hash_log,
2244                    base_hc.chain_log,
2245                    uses_bt,
2246                    dict_size,
2247                );
2248                hc.hash_log = dict_hash_log;
2249                hc.chain_log = dict_chain_log;
2250            }
2251        }
2252        // upstream zstd `ZSTD_resolveRowMatchFinderMode` (zstd_compress.c:238-245):
2253        // the row matchfinder is used for greedy/lazy/lazy2 ONLY when
2254        // `windowLog > 14`; at or below that upstream runs the hash-chain
2255        // matcher (`ZSTD_HcFindBestMatch`). We previously hardcoded the Row
2256        // backend for these strategies regardless of window, sending every
2257        // small-window frame (hinted floor = windowLog 14, e.g. the small-4k/10k
2258        // fixtures) through Row where upstream uses HC. Match it: fall back to
2259        // the hash-chain matcher (lazy/greedy parse via `lazy_depth`) when the
2260        // resolved window is <= 14. The HC config is synthesised from the
2261        // level's RowConfig (HC and Row share the same cParams; only the
2262        // matchfinder differs) — `hash_log` / `chain_log` are
2263        // clamped to the (<= 14) window inside the HashChain reset arm, so the
2264        // nominal width here only sets the clamp ceiling.
2265        if params.search == super::strategy::SearchMethod::RowHash && params.window_log <= 14 {
2266            let row = params
2267                .row
2268                .expect("a RowHash level row must carry a RowConfig");
2269            params.search = super::strategy::SearchMethod::HashChain;
2270            // For a dict-bearing frame, downsize the synthesised HC logs to the
2271            // dictionary's content tier via `cdict_table_logs` (the same
2272            // correction the native HC dict-prime path applies above), so a dict
2273            // much smaller than the window doesn't prime a needlessly sparse
2274            // table. Row-finder levels are never BinaryTree, so `uses_bt = false`.
2275            //
2276            // Feed `cdict_table_logs` the UN-hinted base Row width, not the
2277            // resolved `row.hash_bits`: the latter is already source-capped on a
2278            // hinted reset (the `row_cap = table_log + 1` clamp), so passing it
2279            // here would double-cap exactly as the native HC dict path warns
2280            // above — a small hinted source with a large dictionary would
2281            // collapse the prepared table below what the dict needs.
2282            // `cdict_table_logs` only ever downsizes, so deriving the ceiling
2283            // from the un-hinted base (plus active public overrides) keeps the
2284            // dict-tier geometry intact. No source hint => `row.hash_bits` is
2285            // already the level's full width, so reuse it directly.
2286            let row_cdict_hash_bits = match dict_hint.filter(|&size| size > 0) {
2287                Some(_) => {
2288                    let mut base_params = Self::level_params(level, None);
2289                    if let Some(ov) = self.param_overrides
2290                        && !ov.is_empty()
2291                    {
2292                        apply_param_overrides(&mut base_params, &ov);
2293                    }
2294                    base_params
2295                        .row
2296                        .map_or(row.hash_bits, |base_row| base_row.hash_bits)
2297                }
2298                None => row.hash_bits,
2299            };
2300            // Row-backed levels carry only `hash_bits`; the HC chain table they
2301            // fall back to follows the upstream zstd cParams relationship `chainLog =
2302            // hashLog - 1` for every Row level (L6 c18 h19 .. L12 c22 h23, see
2303            // the ROW_L* tables). Synthesise the chain width as `hash_bits - 1`
2304            // so the dict path doesn't leave the chain table one bit too wide
2305            // (cdict_table_logs only downsizes, so passing the full hash width
2306            // for both would keep a 2x-too-large chain table on dict frames).
2307            // Raw `- 1` is underflow-safe: `hash_bits` is either a predefined
2308            // ROW_L* width (>= 19) or a public `hash_log` override, and the
2309            // override is range-validated to `ZSTD_HASHLOG_MIN = 6` at the
2310            // parameter API, so the value is always >= 6 here.
2311            //
2312            // A public `chain_log` override (#27) is dropped by the RowHash
2313            // override arm (Row has no chain table), but once this frame falls
2314            // back to HC the chain table is live and must honour it — mirror
2315            // the native HC dict path, which feeds the override-applied
2316            // `base_hc.chain_log` into `cdict_table_logs`. Use the explicit
2317            // override (also API-validated to ZSTD_CHAINLOG_MIN = 6) when set,
2318            // else the upstream zstd `hashLog - 1` relationship.
2319            let explicit_chain_log = self
2320                .param_overrides
2321                .filter(|ov| !ov.is_empty())
2322                .and_then(|ov| ov.chain_log)
2323                .map(|chain_log| chain_log as usize);
2324            let row_cdict_chain_bits = explicit_chain_log.unwrap_or(row_cdict_hash_bits - 1);
2325            let (mut hash_log, mut chain_log) = match dict_hint.filter(|&size| size > 0) {
2326                Some(dict_size) => cdict_table_logs(
2327                    params.window_log,
2328                    row_cdict_hash_bits,
2329                    row_cdict_chain_bits,
2330                    false,
2331                    dict_size,
2332                ),
2333                None => (
2334                    row.hash_bits,
2335                    explicit_chain_log.unwrap_or(row.hash_bits - 1),
2336                ),
2337            };
2338            // No-dict path: the HashChain reset arm only clamps the logs to the
2339            // window when `hinted`, but a public `window_log` override can lower
2340            // this level to <= 14 with no source hint — clamp the level's full
2341            // Row `hash_bits` to the window here too (upstream zstd `ZSTD_adjustCParams`:
2342            // hashLog <= windowLog + 1, chainLog <= windowLog) so a 16 KiB window
2343            // doesn't allocate Row-sized HC tables.
2344            if dict_hint.filter(|&size| size > 0).is_none() {
2345                let wlog = params.window_log as usize;
2346                hash_log = hash_log.min(wlog + 1);
2347                chain_log = chain_log.min(wlog);
2348            }
2349            params.hc = Some(HcConfig {
2350                hash_log,
2351                chain_log,
2352                search_depth: row.search_depth,
2353                target_len: row.target_len,
2354                search_mls: 4,
2355            });
2356            params.row = None;
2357        }
2358        let next_backend = params.backend();
2359        let max_window_size = 1usize << params.window_log;
2360        self.dictionary_retained_budget = 0;
2361        // Drop any frame-local borrowed staging so it can't leak across a
2362        // reset and misroute the next start/skip into borrowed dispatch.
2363        self.borrowed_pending = None;
2364        if self.active_backend() != next_backend {
2365            // Drain the outgoing backend's allocations into the shared
2366            // pool. The `match &mut self.storage { ... }` block runs to
2367            // completion before the assignment below replaces the
2368            // variant, so the inner state we just drained is dropped
2369            // with the old variant.
2370            match &mut self.storage {
2371                MatcherStorage::Simple(_m) => {
2372                    // FastKernelMatcher owns a flat Vec<u8> history
2373                    // and a Vec<u32> hash table — both drop with the
2374                    // variant assignment below, no per-block buffers
2375                    // to recycle into the driver pools. The
2376                    // assignment-replace path collapses to a noop
2377                    // pre-pass for this backend.
2378                }
2379                MatcherStorage::Dfast(m) => {
2380                    // Drop the long / short hash table allocations
2381                    // before calling `m.reset`. Without this prepass,
2382                    // `DfastMatchGenerator::reset` would `fill` both
2383                    // tables with `DFAST_EMPTY_SLOT` sentinels — wasted
2384                    // work given the next assignment to `self.storage`
2385                    // is about to drop `m` entirely. `reset` itself
2386                    // short-circuits on `if !self.tables.is_empty()`, so
2387                    // handing it an empty `Vec` skips the fill loop.
2388                    // Mirrors the pre-drain pattern in the HashChain
2389                    // arm below (and serves the same peak-memory
2390                    // purpose: release the table-allocation footprint
2391                    // before constructing the replacement variant).
2392                    m.tables = Vec::new();
2393                    m.reset();
2394                }
2395                MatcherStorage::Row(m) => {
2396                    m.row_heads = Vec::new();
2397                    m.row_positions = Vec::new();
2398                    m.row_tags = Vec::new();
2399                    m.reset();
2400                }
2401                MatcherStorage::HashChain(m) => {
2402                    // Release oversized tables when switching away from
2403                    // HashChain so Best's larger allocations don't persist.
2404                    // hash3_table must be released alongside the other
2405                    // two: BtUltra2's `1 << HC3_HASH_LOG` entries would
2406                    // otherwise stay pinned across the backend switch,
2407                    // even though no future caller of this backend will
2408                    // touch them.
2409                    m.table.hash_table = Vec::new();
2410                    m.table.chain_table = Vec::new();
2411                    m.table.hash3_table = Vec::new();
2412                    let vec_pool = &mut self.vec_pool;
2413                    m.reset(|mut data| {
2414                        data.resize(data.capacity(), 0);
2415                        vec_pool.push(data);
2416                    });
2417                }
2418            }
2419            // Swap in a fresh variant for the new backend. The previous
2420            // `storage` is dropped here.
2421            self.storage = match next_backend {
2422                super::strategy::BackendTag::Simple => {
2423                    // Per-level Fast cParams from resolve_level_params:
2424                    // Level(1) gets (hash_log=14, mls=7); Level(-7..=-1)
2425                    // get upstream zstd row-0 (hash_log=13, mls=7); Fastest /
2426                    // Uncompressed keep (hash_log=14, mls=6). See
2427                    // resolve_level_params for rationale.
2428                    let fast = params.fast.expect("Fast level row carries a FastConfig");
2429                    MatcherStorage::Simple(FastKernelMatcher::with_params(
2430                        params.window_log,
2431                        fast.hash_log,
2432                        fast.mls,
2433                        fast.step_size,
2434                    ))
2435                }
2436                super::strategy::BackendTag::Dfast => {
2437                    MatcherStorage::Dfast(DfastMatchGenerator::new(max_window_size))
2438                }
2439                super::strategy::BackendTag::Row => {
2440                    MatcherStorage::Row(RowMatchGenerator::new(max_window_size))
2441                }
2442                super::strategy::BackendTag::HashChain => {
2443                    MatcherStorage::HashChain(HcMatchGenerator::new(max_window_size))
2444                }
2445            };
2446        }
2447
2448        // Single source of truth: `LevelParams::strategy_tag` is the
2449        // authoritative mapping from `CompressionLevel` to strategy.
2450        // `storage.backend()` derives the parse family from the variant,
2451        // so there is no separate runtime tag that could drift against
2452        // `LEVEL_TABLE`.
2453        self.strategy_tag = params.strategy_tag;
2454        self.search = params.search;
2455        self.parse = params.parse();
2456        self.slice_size = self.base_slice_size.min(max_window_size);
2457        self.reported_window_size = max_window_size;
2458        let strategy_tag = self.strategy_tag;
2459        // Source-proportional table window for the backends whose hash-table
2460        // widths are recomputed here (Dfast / Row). Like the HC / Fast caps
2461        // in `adjust_params_for_source_size`, this sizes the internal tables
2462        // from the RAW source log (not the wire `window_log` floor) so a
2463        // small frame zeroes a small table; it never exceeds the real window.
2464        let table_window_size = match hint {
2465            Some(h) => {
2466                let raw_log = source_size_ceil_log(h);
2467                // Clamp the shift below the pointer width before `1usize <<`:
2468                // an oversized hint (>= 2^63 + 1, and on 32-bit usize any hint
2469                // >= 2^32) drives `raw_log` to 64 / >= 32, and the shift would
2470                // overflow (panic in debug, wrap to 0 in release) before the
2471                // `.min(max_window_size)` cap below could bound it. The min cap
2472                // still provides the real semantic window bound.
2473                let shift = raw_log.max(MIN_WINDOW_LOG).min(usize::BITS as u8 - 1);
2474                (1usize << shift).min(max_window_size)
2475            }
2476            None => max_window_size,
2477        };
2478        // The hint-dependent hash-table width the active backend applies, for
2479        // the primed-snapshot key. Dfast/Row compute it from `table_window_size`
2480        // below; HC/Fast leave it `0` because their widths live in `params`
2481        // (`hc.{hash,chain}_log` / `fast_hash_log`) — already part of the key.
2482        let mut resolved_table_bits: usize = 0;
2483        match &mut self.storage {
2484            MatcherStorage::Simple(m) => {
2485                // Per-level Fast cParams threaded from
2486                // resolve_level_params (see Simple-backend swap
2487                // arm above for the (level → params) mapping).
2488                let fast = params.fast.expect("Fast level row carries a FastConfig");
2489                // Same attach/copy split the dict-prime dispatch applies
2490                // below (`prime_with_dictionary`): only attach-mode dict
2491                // frames may keep the main table across the reset via an
2492                // epoch advance — copy-mode and no-dict frames must memset
2493                // it back to bias 0 for the raw-slice kernels.
2494                // `Some(0)` is "no dictionary" (the dict-sizing path above
2495                // filters it the same way): an empty dict primes nothing, so
2496                // an epoch-advance reset would preserve stale attach state
2497                // instead of clearing it.
2498                let dict_attach_epoch = matches!(dict_hint, Some(size) if size > 0)
2499                    && self.reset_dict_attach_ok
2500                    && self
2501                        .reset_size_log
2502                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2503                // Copy-mode dictionary frame whose primed snapshot matches
2504                // this exact resolved shape: `restore_primed_dictionary`
2505                // (called right after this reset; the caller gates the
2506                // restore on the same size bucket and the restore re-checks
2507                // the same key) will `clone_from` the snapshot over this
2508                // matcher, replacing the table contents and bias wholesale —
2509                // the reset's full-table memset would be thrown away. The
2510                // key components mirror `reset_shape` below: Simple leaves
2511                // `resolved_table_bits` 0, never carries an LDM override,
2512                // and `fast_attach` is false in copy mode by construction.
2513                let table_overwritten_by_restore = matches!(dict_hint, Some(size) if size > 0)
2514                    && !dict_attach_epoch
2515                    && self.primed.as_ref().is_some_and(|(_, _, captured)| {
2516                        *captured
2517                            == PrimedKey {
2518                                level,
2519                                params,
2520                                table_bits: 0,
2521                                fast_attach: false,
2522                                ldm: None,
2523                            }
2524                    });
2525                // Cap `hash_log <= window_log + 1` (upstream zstd
2526                // `ZSTD_adjustCParams_internal`): once `window_log` is resized
2527                // down for a small source, a level-default `1 << hash_log`
2528                // table is mostly wasted address space whose per-frame memset
2529                // dominates the compress cost on tiny frames (a 4 KB frame at
2530                // window_log 12 still zero-fills the 64 KiB hash_log-14 table).
2531                // Gated to no-dict frames: the dict-attach path shares one
2532                // hash_log between the main and dict tables (so one hash keys
2533                // both), and shrinking only the main table would break that
2534                // invariant and the small-frame dict ratio.
2535                let hash_log = if dict_hint.is_some_and(|s| s > 0) {
2536                    fast.hash_log
2537                } else {
2538                    fast.hash_log.min(params.window_log as u32 + 1)
2539                };
2540                m.reset(
2541                    params.window_log,
2542                    hash_log,
2543                    fast.mls,
2544                    fast.step_size,
2545                    dict_attach_epoch,
2546                    table_overwritten_by_restore,
2547                );
2548            }
2549            MatcherStorage::Dfast(dfast) => {
2550                dfast.max_window_size = max_window_size;
2551                let dcfg = params
2552                    .dfast
2553                    .expect("Dfast level row must carry a DfastConfig");
2554                // Upstream zstd `cParams.hashLog`/`chainLog`, capped by the
2555                // source-size window when hinted so tiny inputs don't
2556                // over-allocate.
2557                let long_bits = if hinted {
2558                    dfast_hash_bits_for_window(table_window_size).min(dcfg.long_hash_log as usize)
2559                } else {
2560                    dcfg.long_hash_log as usize
2561                };
2562                let short_bits = if hinted {
2563                    dfast_hash_bits_for_window(table_window_size).min(dcfg.short_hash_log as usize)
2564                } else {
2565                    dcfg.short_hash_log as usize
2566                };
2567                resolved_table_bits = long_bits;
2568                dfast.set_hash_bits(long_bits, short_bits);
2569                // Dfast holds no per-block input Vecs (history owns the
2570                // bytes and `add_data` returns each Vec eagerly), so
2571                // `reset` takes no `reuse_space` callback.
2572                dfast.reset();
2573            }
2574            MatcherStorage::Row(row) => {
2575                row.max_window_size = max_window_size;
2576                row.lazy_depth = params.lazy_depth;
2577                let mut row_cfg = params.row.expect("Row level row carries a RowConfig");
2578                if hinted {
2579                    // Clamp the configured hash width by the hinted window
2580                    // (upstream zstd `ZSTD_adjustCParams` caps hashLog by windowLog) —
2581                    // `min`, not replace, so an explicit `hash_log` param
2582                    // override (`row_cfg.hash_bits`) survives the hinted path
2583                    // instead of being overwritten by the window value.
2584                    //
2585                    // Clamp BEFORE `configure` so the backend sees ONE width
2586                    // per frame. Configuring with the unclamped level width
2587                    // and then re-clamping made `row_hash_log` oscillate on
2588                    // every hinted frame, and each width change clears the
2589                    // row tables — `ensure_tables` then re-filled all three
2590                    // every frame in a reused compressor.
2591                    row_cfg.hash_bits = row_cfg
2592                        .hash_bits
2593                        .min(row_hash_bits_for_window(table_window_size));
2594                }
2595                row.configure(row_cfg);
2596                // Key the primed snapshot on the width the backend ACTUALLY
2597                // applied (`set_hash_bits` clamps the request): recording the
2598                // request — or the 0 default on the unhinted path — keys
2599                // identical table geometries apart and forces needless
2600                // dictionary re-primes.
2601                resolved_table_bits = row.hash_bits();
2602                row.reset();
2603            }
2604            MatcherStorage::HashChain(hc) => {
2605                hc.table.max_window_size = max_window_size;
2606                hc.hc.lazy_depth = params.lazy_depth;
2607                let mut hc_cfg = params.hc.expect("HashChain level row carries an HcConfig");
2608                // Cap the hash / chain table logs by the hinted window so a small
2609                // input doesn't allocate the full level's tables (the upstream zstd
2610                // `ZSTD_adjustCParams_internal` clamp: `hashLog <= windowLog + 1`,
2611                // and `cycleLog <= windowLog` — `cycleLog == chainLog` for the HC
2612                // finder, `chainLog - 1` for the BT pair table, so `chainLog <=
2613                // windowLog` (+1 for BT)). Ratio-neutral: a hinted window of
2614                // `2^wlog` bytes holds at most `2^wlog` positions, so the slots
2615                // beyond that are never populated — capping only sheds unused
2616                // allocation. Was the source of L10-lazy peak-alloc ~2.15x the
2617                // upstream zstd on a 1 MiB input. Only applied when hinted; an
2618                // unknown-size stream keeps the full level tables.
2619                // Skip for dict-bearing frames: their `hc_cfg.{hash,chain}_log`
2620                // were already sized to the dictionary content tier via
2621                // `cdict_table_logs` (the dict supplies the long-distance
2622                // matches, so upstream `ZSTD_createCDict` sizes the prepared
2623                // tables to the dict, not the source window). Re-applying the
2624                // source-window cap here would collapse those dict-tier logs
2625                // back to the small hinted source — the same double-cap the
2626                // synthesis sites avoid by using the un-hinted base width.
2627                if hinted && !matches!(dict_hint, Some(size) if size > 0) {
2628                    let wlog = hc_hash_bits_for_window(table_window_size);
2629                    let uses_bt = matches!(
2630                        strategy_tag,
2631                        super::strategy::StrategyTag::Btlazy2
2632                            | super::strategy::StrategyTag::BtOpt
2633                            | super::strategy::StrategyTag::BtUltra
2634                            | super::strategy::StrategyTag::BtUltra2
2635                    );
2636                    hc_cfg.hash_log = hc_cfg.hash_log.min(wlog + 1);
2637                    hc_cfg.chain_log = hc_cfg.chain_log.min(if uses_bt { wlog + 1 } else { wlog });
2638                }
2639                hc.configure(hc_cfg, strategy_tag, params.window_log);
2640                let vec_pool = &mut self.vec_pool;
2641                hc.reset(|mut data| {
2642                    data.resize(data.capacity(), 0);
2643                    vec_pool.push(data);
2644                });
2645                // When the source size is known, pre-size the history mirror to
2646                // the expected total (dictionary + payload) so per-block growth
2647                // does not overshoot via Vec capacity doubling (upstream zstd sizes its
2648                // window buffer exactly). Dominates peak once the match-finder
2649                // tables are dictionary-tier-small. Unhinted streams skip this
2650                // and keep doubling growth.
2651                if let Some(src) = hint {
2652                    // `src` is a u64 hint and may be the u64::MAX "unknown
2653                    // size" sentinel, which truncates under `as usize` on
2654                    // 32-bit targets and overflows when the dict hint is
2655                    // added. Saturate the source size, then saturate the
2656                    // dict-hint addition; `reserve_history` applies the
2657                    // tighter window ceiling to the result.
2658                    let src_hint = usize::try_from(src).unwrap_or(usize::MAX);
2659                    let expected = src_hint.saturating_add(dict_hint.unwrap_or(0));
2660                    hc.table.reserve_history(expected);
2661                }
2662            }
2663        }
2664        // LDM wiring (#27): attach (or clear) the long-distance-match
2665        // producer on the optimal (BT) backend. LDM is the only
2666        // back-reference path that crosses the regular window, so it
2667        // only has a home on the `BtMatcher`; non-BT strategies drop the
2668        // producer. Built AFTER `hc.reset()` because `BtMatcher::reset`
2669        // clears an existing producer's table but does not null the
2670        // slot — installing here gives the new frame a fresh producer.
2671        #[cfg(feature = "hash")]
2672        {
2673            // Resolve the derived LDM params first (immutable borrow of the
2674            // overrides), then reuse the existing producer's allocation below.
2675            let derived_ldm = self
2676                .param_overrides
2677                .as_ref()
2678                .and_then(|ov| ov.ldm)
2679                .map(|ldm_ov| {
2680                    let strategy_ord = ldm_strategy_ordinal(params.strategy_tag, params.lazy_depth);
2681                    // Seed the caller-pinned knobs, then run the upstream zstd
2682                    // derivation over the seed so the remaining (zero)
2683                    // fields are filled with cross-field consistency
2684                    // (e.g. `hash_rate_log = window_log - hash_log`).
2685                    // Clobbering after `adjust_for` would break that and
2686                    // hand the producer an inconsistent set.
2687                    let seed = super::ldm::params::LdmParams {
2688                        window_log: params.window_log as u32,
2689                        hash_log: ldm_ov.hash_log.unwrap_or(0),
2690                        hash_rate_log: ldm_ov.hash_rate_log.unwrap_or(0),
2691                        min_match_length: ldm_ov.min_match.unwrap_or(0),
2692                        bucket_size_log: ldm_ov.bucket_size_log.unwrap_or(0),
2693                    };
2694                    seed.derive(strategy_ord)
2695                });
2696            if let MatcherStorage::HashChain(hc) = &mut self.storage {
2697                // Reuse the existing producer's hash-table allocation when the
2698                // derived params are unchanged: only `clear()` (re-zero the
2699                // table + re-seed the rolling hash, no allocation) is needed for
2700                // the new frame. A params change (or the first frame) forces a
2701                // fresh `LdmProducer::new`. On the reused-encoder compress-dict
2702                // path this avoids re-allocating the LDM hash table (large at
2703                // btultra2) every frame — upstream zstd reuses its `ldmState_t`
2704                // the same way. `clear()` is mandatory here for correctness
2705                // regardless of what `BtMatcher::reset` did to the old table.
2706                let producer = derived_ldm.map(|p| match hc.take_ldm_producer() {
2707                    Some(mut existing) if existing.params() == p => {
2708                        existing.clear();
2709                        existing
2710                    }
2711                    _ => super::ldm::LdmProducer::new(p),
2712                });
2713                hc.set_ldm_producer(producer);
2714            }
2715        }
2716        // Record the resolved matcher shape for the primed-snapshot key. Captured
2717        // here (post-resolution, after the test-only param override) so the key
2718        // reflects exactly the geometry the restored `storage` must match. The
2719        // Fast attach-vs-copy mode is part of the shape ONLY for the Simple
2720        // backend (it decides the distinct dict-table shape that backend builds).
2721        // Dfast/Row/HashChain have their OWN attach/copy regimes, but this bit
2722        // models only the Fast table split; those backends are keyed by the
2723        // resolved matcher geometry instead, so folding the Fast bit into their
2724        // key would over-key identical resolved shapes. When it applies it
2725        // matches the decision `prime_with_dictionary` makes from the same
2726        // `reset_size_log`.
2727        let fast_attach = matches!(next_backend, super::strategy::BackendTag::Simple)
2728            && self.reset_dict_attach_ok
2729            && self
2730                .reset_size_log
2731                .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2732        // The LDM override is part of the snapshot identity ONLY on the
2733        // optimal (BinaryTree) path: that is the only backend whose cloned
2734        // `storage` carries a `BtMatcher::ldm_producer`. On Fast / Dfast /
2735        // Row and lazy-HashChain resets the producer slot does not exist,
2736        // so folding the override there would over-key the snapshot and
2737        // force needless re-primes when LDM is toggled. Gated like
2738        // `fast_attach` (a key bit only participates where it changes the
2739        // cloned matcher shape).
2740        let active_ldm = if matches!(params.search, super::strategy::SearchMethod::BinaryTree) {
2741            self.param_overrides.and_then(|ov| ov.ldm)
2742        } else {
2743            None
2744        };
2745        self.reset_shape = Some((params, resolved_table_bits, fast_attach, active_ldm));
2746    }
2747
2748    fn dictionary_is_resident(&self) -> bool {
2749        match &self.storage {
2750            MatcherStorage::HashChain(hc) => hc.table.dict_resident,
2751            MatcherStorage::Simple(s) => s.dict_resident(),
2752            MatcherStorage::Dfast(d) => d.dict_resident(),
2753            _ => false,
2754        }
2755    }
2756
2757    fn reapply_resident_dictionary(&mut self, offset_hist: [u32; 3]) {
2758        // Same offset-history head as `prime_with_dictionary`, without the dict
2759        // commit / re-index (resident dict bytes + cached dms already in place).
2760        match self.active_backend() {
2761            super::strategy::BackendTag::Simple => {
2762                self.simple_mut().prime_offset_history(offset_hist)
2763            }
2764            super::strategy::BackendTag::Dfast => {
2765                self.dfast_matcher_mut().offset_hist = offset_hist
2766            }
2767            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2768            super::strategy::BackendTag::HashChain => {
2769                let matcher = self.hc_matcher_mut();
2770                matcher.table.offset_hist = offset_hist;
2771                matcher.table.mark_dictionary_primed();
2772            }
2773        }
2774        // Restore the retained-dictionary budget the per-frame `reset` cleared.
2775        // The matcher's `reset` re-inflated `max_window_size` by the resident
2776        // dict region (so the dict + next input both stay in the eviction band),
2777        // exactly as `prime_with_dictionary` does — but the resident path skips
2778        // that prime, so without this the driver-level budget stays 0 and
2779        // `retire_dictionary_budget` never shrinks the inflated window as input
2780        // evicts the dict. For HashChain (whose `window_low` is measured against
2781        // `max_window_size`), a stuck-inflated window would let a post-eviction
2782        // match exceed the frame header's base window and emit an over-window
2783        // offset. The inflation equals `max_window_size - base`, and
2784        // `reported_window_size` is the base `1 << window_log` set by `reset`.
2785        let base = self.reported_window_size;
2786        let inflated = match self.active_backend() {
2787            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2788            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2789            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2790            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2791        };
2792        self.dictionary_retained_budget = inflated.saturating_sub(base);
2793    }
2794
2795    fn prime_with_dictionary(&mut self, dict_content: &[u8], offset_hist: [u32; 3]) {
2796        match self.active_backend() {
2797            super::strategy::BackendTag::Simple => {
2798                // Routes through prime_offset_history so BOTH
2799                // offset_hist (wire encoder) and rep[0..2] (kernel)
2800                // are updated atomically. Without this, the two
2801                // tracks drift after dict priming — kernel emits
2802                // repcode matches against stale FAST_INITIAL_REP
2803                // while the wire encoder uses the primed history,
2804                // producing divergent wire encoding (Copilot review
2805                // #15 on #216).
2806                self.simple_mut().prime_offset_history(offset_hist);
2807            }
2808            super::strategy::BackendTag::Dfast => {
2809                self.dfast_matcher_mut().offset_hist = offset_hist
2810            }
2811            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2812            super::strategy::BackendTag::HashChain => {
2813                let matcher = self.hc_matcher_mut();
2814                matcher.table.offset_hist = offset_hist;
2815                matcher.table.mark_dictionary_primed();
2816            }
2817        }
2818
2819        if dict_content.is_empty() {
2820            return;
2821        }
2822
2823        // Dictionary bytes should stay addressable until produced frame output
2824        // itself exceeds the live window size. We bump `max_window_size`
2825        // by the dictionary length so the eviction band keeps the
2826        // primed bytes in `history`.
2827        //
2828        // Cap: `with_params`/`reset` enforce `window_log <= 30` so the
2829        // eviction band `2 * max_window_size` stays below `u32::MAX`
2830        // with headroom for one MAX_BLOCK_SIZE pending block — the
2831        // kernel asserts `data.len() <= u32::MAX`. A large enough
2832        // dictionary could otherwise push `max_window_size` past
2833        // that ceiling via the `saturating_add` below and silently
2834        // re-introduce the same overflow the `window_log` cap was
2835        // designed to prevent. Clamp the post-priming size so the
2836        // doubled-band-plus-block invariant survives.
2837        use super::match_table::storage::MAX_PRIMED_WINDOW_SIZE;
2838
2839        // `requested_dict_budget` is what the caller asked for;
2840        // `base_max_window_size` snapshots the pre-priming cap so we
2841        // can compute how much window the cap actually GRANTED below.
2842        // The cap may clip the requested growth, in which case the
2843        // bookkeeping (`dictionary_retained_budget` retire path) must
2844        // track only the granted portion — otherwise
2845        // `retire_dictionary_budget()` would later reclaim more than
2846        // was actually added and shrink the matcher below its real
2847        // base window (and `cap = 2 * max_window_size` would shrink
2848        // with it, risking under-allocation on subsequent commits).
2849        // The `granted_retained_budget` calculation further below is
2850        // the load-bearing piece — see its block-level comment for
2851        // the post-clip / post-uncommitted-tail math.
2852        let requested_dict_budget = dict_content.len();
2853        let base_max_window_size = match self.active_backend() {
2854            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2855            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2856            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2857            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2858        };
2859        match self.active_backend() {
2860            super::strategy::BackendTag::Simple => {
2861                let matcher = self.simple_mut();
2862                matcher.max_window_size = matcher
2863                    .max_window_size
2864                    .saturating_add(requested_dict_budget)
2865                    .min(MAX_PRIMED_WINDOW_SIZE);
2866            }
2867            super::strategy::BackendTag::Dfast => {
2868                let matcher = self.dfast_matcher_mut();
2869                matcher.max_window_size = matcher
2870                    .max_window_size
2871                    .saturating_add(requested_dict_budget)
2872                    .min(MAX_PRIMED_WINDOW_SIZE);
2873            }
2874            super::strategy::BackendTag::Row => {
2875                let matcher = self.row_matcher_mut();
2876                matcher.max_window_size = matcher
2877                    .max_window_size
2878                    .saturating_add(requested_dict_budget)
2879                    .min(MAX_PRIMED_WINDOW_SIZE);
2880            }
2881            super::strategy::BackendTag::HashChain => {
2882                let matcher = self.hc_matcher_mut();
2883                matcher.table.max_window_size = matcher
2884                    .table
2885                    .max_window_size
2886                    .saturating_add(requested_dict_budget)
2887                    .min(MAX_PRIMED_WINDOW_SIZE);
2888            }
2889        }
2890
2891        let mut start = 0usize;
2892        let mut committed_dict_budget = 0usize;
2893        // insert_position needs 4 bytes of lookahead for hashing;
2894        // backfill_boundary_positions re-visits tail positions once the
2895        // next slice extends history, but cannot hash <4 byte fragments.
2896        let min_primed_tail = match self.active_backend() {
2897            super::strategy::BackendTag::Simple => MIN_MATCH_LEN,
2898            super::strategy::BackendTag::Dfast
2899            | super::strategy::BackendTag::Row
2900            | super::strategy::BackendTag::HashChain => 4,
2901        };
2902        while start < dict_content.len() {
2903            let end = (start + self.slice_size).min(dict_content.len());
2904            if end - start < min_primed_tail {
2905                break;
2906            }
2907            // Stage the dict chunk WITHOUT `get_next_space`'s
2908            // `resize(slice_size, 0)` zero-fill: that memsets a full
2909            // block-sized buffer (up to ~128 KiB) every frame only to have it
2910            // `clear()`-ed and overwritten by the dict bytes on the very next
2911            // lines — pure waste (measured ~10% of the small dict encode).
2912            // Reuse a pooled buffer's capacity if one is free (the prime/skip
2913            // cycle recycles them back), else allocate exactly the chunk.
2914            // Mirrors upstream zstd, which references the CDict content rather
2915            // than zero-filling a fresh window per frame.
2916            let mut space = self.vec_pool.pop().unwrap_or_default();
2917            space.clear();
2918            space.extend_from_slice(&dict_content[start..end]);
2919            self.commit_space(space);
2920            self.skip_matching_for_dictionary_priming();
2921            committed_dict_budget += end - start;
2922            start = end;
2923        }
2924
2925        // Derive `granted_retained_budget` directly from the two real
2926        // bounds — bytes actually committed and bytes the cap allows
2927        // — instead of doing a cap-clip pass followed by an
2928        // uncommitted-tail subtract. Previous shape double-discounted
2929        // when the cap clipped: clip lost `(requested - allowed)`,
2930        // then tail-subtract lost ANOTHER `(requested - committed)`,
2931        // leaving `max_window_size` shy of the dictionary that was
2932        // actually retained (e.g. cap=900, committed=998, uncommitted=2
2933        // landed at granted=898 instead of the correct 900).
2934        let capped_retained_budget = MAX_PRIMED_WINDOW_SIZE.saturating_sub(base_max_window_size);
2935        let granted_retained_budget = committed_dict_budget.min(capped_retained_budget);
2936        let final_max_window_size = base_max_window_size.saturating_add(granted_retained_budget);
2937        match self.active_backend() {
2938            super::strategy::BackendTag::Simple => {
2939                self.simple_mut().max_window_size = final_max_window_size;
2940            }
2941            super::strategy::BackendTag::Dfast => {
2942                self.dfast_matcher_mut().max_window_size = final_max_window_size;
2943            }
2944            super::strategy::BackendTag::Row => {
2945                self.row_matcher_mut().max_window_size = final_max_window_size;
2946            }
2947            super::strategy::BackendTag::HashChain => {
2948                self.hc_matcher_mut().table.max_window_size = final_max_window_size;
2949            }
2950        }
2951        if granted_retained_budget > 0 {
2952            self.dictionary_retained_budget = self
2953                .dictionary_retained_budget
2954                .saturating_add(granted_retained_budget);
2955        }
2956        if self.active_backend() == super::strategy::BackendTag::HashChain {
2957            // Recompute the lazy-HC attach decision made per-chunk in
2958            // `skip_matching_for_dictionary_priming` (stable across the prime —
2959            // `reset_size_log` does not change here).
2960            //
2961            // The HC attach/copy mode is deliberately NOT folded into `PrimedKey`
2962            // (unlike Fast `fast_attach`). Fast attach builds a separate dict
2963            // table whose dimensions differ from the copy-mode live table, so a
2964            // cross-mode restore would install mismatched table geometry and the
2965            // encoder could search past the frame window (undecodable). The two
2966            // HC modes share identical window geometry: `max_window_size` and the
2967            // dictionary limit are both set ABOVE this branch (the same value in
2968            // either mode), and the live chain table dimensions come from the
2969            // resolved `params` the key already pins. The modes differ only in
2970            // WHERE the committed dict lives — a single-link `dms` (attach) vs
2971            // merged into the live chain (copy) — both producing valid matches at
2972            // in-window offsets. Upstream zstd makes the same observation: attach
2973            // (`ZSTD_resetCCtx_byAttachingCDict`) and copy
2974            // (`ZSTD_resetCCtx_byCopyingCDict`) both keep the caller's
2975            // `windowLog`; the choice is a memory/speed trade-off, not a wire
2976            // contract. So restoring an attach snapshot where this frame would
2977            // have copied (or vice versa) yields a decodable frame that may only
2978            // differ in which matches are found (ratio) — algorithmic freedom, not
2979            // a defect. Keying on the mode would instead force a re-prime across
2980            // the cutoff, re-adding the per-frame cost this snapshot path removes.
2981            //
2982            // In practice the public reuse path (`compress_independent_frame`)
2983            // only ever captures AND restores the COPY-mode snapshot — capture is
2984            // gated on the above-cutoff source size, so a restored frame always
2985            // matches the captured mode. `hc_dict_snapshot_reuse_roundtrips` pins
2986            // that same-mode reuse decodes; the driver-level cross-mode restore is
2987            // accepted (not refused) per
2988            // `primed_snapshot_fast_attach_does_not_over_key_non_simple_backends`.
2989            let attach = self.hc_dict_attach_mode();
2990            let table = &mut self.hc_matcher_mut().table;
2991            table.set_dictionary_limit_from_primed_bytes(committed_dict_budget);
2992            // Build the dictMatchState over the committed dict (front of history)
2993            // so `find_best_match` dual-probes it with its own compare budget —
2994            // but ONLY in ATTACH mode. BT/optimal attach → DUBT dms; lazy-HC
2995            // attach → single-link hash-chain dms. COPY mode (large known source,
2996            // both BT and lazy-HC) already merged the dict into the live tree /
2997            // chain in `skip_matching_for_dictionary_priming`, so it carries no
2998            // separate dms — drop any stale one.
2999            if !attach {
3000                table.dms.invalidate();
3001            } else if table.uses_bt {
3002                table.prime_dms_bt(committed_dict_budget);
3003            } else {
3004                table.prime_dms_hc(committed_dict_budget);
3005            }
3006        }
3007        // CDict-equivalent: now that every dict chunk is indexed, mark the
3008        // Fast-backend dict table primed so the next frame's re-prime reuses
3009        // it (skips the re-hash) while still re-committing the dict bytes to
3010        // history. No-op when the attach path built no table (copy mode or a
3011        // sub-8-byte dict) — `mark_dict_primed` self-guards on table presence.
3012        match self.active_backend() {
3013            super::strategy::BackendTag::Simple => self.simple_mut().mark_dict_primed(),
3014            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().mark_dict_primed(),
3015            super::strategy::BackendTag::Row => self.row_matcher_mut().mark_dict_primed(),
3016            _ => {}
3017        }
3018    }
3019
3020    fn restore_primed_dictionary(&mut self, level: super::CompressionLevel) -> bool {
3021        // Only the (storage, dictionary_retained_budget) pair is what
3022        // `prime_with_dictionary` writes; restoring them reproduces the
3023        // post-prime state exactly. Gated on the FULL resolved key (level + the
3024        // resolved `LevelParams` + the active backend's table width), not just
3025        // the level: `reset` resolves the hint into a window/table geometry, so a
3026        // same-level snapshot taken at a hint that resolved to a different shape
3027        // carries a `storage.max_window_size` / table dimensions that no longer
3028        // match this reset. Restoring it would let the encoder search past the
3029        // frame header's window (an undecodable match), so on a key mismatch we
3030        // refuse and the caller re-primes.
3031        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
3032            return false;
3033        };
3034        let key = PrimedKey {
3035            level,
3036            params,
3037            table_bits,
3038            fast_attach,
3039            ldm,
3040        };
3041        let Some((snapshot, budget, captured_key)) = &self.primed else {
3042            return false;
3043        };
3044        if *captured_key != key {
3045            return false;
3046        }
3047        let budget = *budget;
3048        match (&mut self.storage, snapshot) {
3049            // Same-variant Fast restore: copy the snapshot into the retained
3050            // live storage. `clone_from` reuses the history / hash-table /
3051            // dict-table buffers, so this is the upstream zstd CDict table-copy
3052            // regime's cost (pure copies) instead of a full per-frame
3053            // allocation + copy + drop cycle.
3054            (MatcherStorage::Simple(live), MatcherStorage::Simple(snap)) => {
3055                live.clone_from(snap);
3056            }
3057            // Same-variant HC lazy/greedy restore (non-BT): the snapshot keeps
3058            // the full primed hash/chain tables (capture's non-BT full clone),
3059            // so `clone_from` reuses the live history/hash/chain/dms buffers in
3060            // place — upstream zstd reuses the CDict tables rather than reallocating
3061            // them. This is the per-frame allocate+copy+drop that dominated
3062            // small `compress-dict` HC frames (5-7x vs C). BT (`uses_bt`)
3063            // snapshots drop their live tables, so they stay on the realloc
3064            // path below.
3065            (MatcherStorage::HashChain(live), MatcherStorage::HashChain(snap))
3066                if !snap.table.uses_bt =>
3067            {
3068                live.table.clone_from(&snap.table);
3069                live.hc.clone_from(&snap.hc);
3070                live.strategy_tag = snap.strategy_tag;
3071                // backend is `HcBackend::Hc` (zero-sized) for non-BT levels;
3072                // the live one is already correct for this resolved key.
3073            }
3074            (live, snapshot_storage) => {
3075                let mut storage = snapshot_storage.clone();
3076                // This arm handles the binary-tree backend. In ATTACH mode the
3077                // snapshot was stored WITHOUT its live hash / chain / hash3
3078                // tables (they hold no dictionary entries — the dict lives in
3079                // `dms` + history; see `capture_primed_dictionary`), so
3080                // `ensure_tables` re-allocates them zeroed to the snapshot's
3081                // geometry, exactly reproducing the post-prime state (all
3082                // `HC_EMPTY`). In COPY mode the snapshot retained its FULL live
3083                // tree (the dict was merged into it, no `dms`), so the tables are
3084                // already present at the right length and `ensure_tables` — which
3085                // only allocates on a length mismatch — leaves them untouched.
3086                // Either way this is a full storage replace, so no stale
3087                // live-table entry from a prior frame can survive.
3088                if let MatcherStorage::HashChain(hc) = &mut storage {
3089                    hc.table.ensure_tables();
3090                }
3091                // The snapshot does not retain the LDM producer (it holds no
3092                // dict state; see `capture_primed_dictionary`). Carry over the
3093                // frame's freshly-reset producer — built this frame by `reset`
3094                // with the same params the snapshot key pins, and empty (no
3095                // input processed yet), so it is equivalent to the producer
3096                // the snapshot was captured with.
3097                #[cfg(feature = "hash")]
3098                {
3099                    let fresh_ldm = if let MatcherStorage::HashChain(hc) = live {
3100                        hc.take_ldm_producer()
3101                    } else {
3102                        None
3103                    };
3104                    if let MatcherStorage::HashChain(hc) = &mut storage {
3105                        hc.set_ldm_producer(fresh_ldm);
3106                    }
3107                }
3108                *live = storage;
3109            }
3110        }
3111        self.dictionary_retained_budget = budget;
3112        true
3113    }
3114
3115    fn capture_primed_dictionary(&mut self, level: super::CompressionLevel) {
3116        // No resolved shape means `reset` has not run for this frame — nothing
3117        // valid to key a snapshot on, so skip the capture.
3118        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
3119            return;
3120        };
3121        let key = PrimedKey {
3122            level,
3123            params,
3124            table_bits,
3125            fast_attach,
3126            ldm,
3127        };
3128        // CDict-equivalent retained state. A binary-tree level in ATTACH mode
3129        // decouples the dictionary into `dms` (the upstream zstd `dictMatchState`); its
3130        // live hash / chain / hash3 tables carry NO dict entries
3131        // (`skip_matching_dict_bt` keeps the dict out of the live tree), so they
3132        // are pure zeros. Storing them in the snapshot wastes the full table
3133        // footprint (a second window-tier table set resident for the whole
3134        // compress). Instead, move the live tables OUT of the working storage,
3135        // clone only the dict-state (history + `dms` + window/offset/dict-limit),
3136        // then move the live tables back — the snapshot keeps just what upstream zstd's
3137        // CDict keeps, and `restore_primed_dictionary` re-allocates the zeroed
3138        // live tables. Every other case keeps the dict reachable through the live
3139        // structure, so the snapshot must retain the full tables (full clone):
3140        // lazy-HC attach (it DOES prime a hash-chain `dms`, but the live chain is
3141        // still the search structure, so the tables must travel) and COPY mode for
3142        // BOTH BT and lazy-HC (`dms` invalidated, dict merged into the live tree /
3143        // chain). `uses_bt && dms.is_primed()` is therefore the exact "decoupled"
3144        // signal — true only for the BT attach prime; lazy-HC attach primes `dms`
3145        // too but is intentionally NOT decoupled.
3146        let bt_decoupled = matches!(
3147            &self.storage,
3148            MatcherStorage::HashChain(hc) if hc.table.uses_bt && hc.table.dms.is_primed()
3149        );
3150        if bt_decoupled {
3151            let MatcherStorage::HashChain(hc) = &mut self.storage else {
3152                unreachable!("bt_decoupled implies HashChain storage");
3153            };
3154            let hash_table = core::mem::take(&mut hc.table.hash_table);
3155            let chain_table = core::mem::take(&mut hc.table.chain_table);
3156            let hash3_table = core::mem::take(&mut hc.table.hash3_table);
3157            // The LDM producer carries no dictionary state (LDM is not
3158            // dict-primed; its hash table is empty at capture), so it is not
3159            // retained either — `restore` reinstates the frame's freshly-reset
3160            // producer. Take it out so the clone does not duplicate its table.
3161            #[cfg(feature = "hash")]
3162            let ldm_producer = hc.take_ldm_producer();
3163            // Clone the dict-state-only storage (live tables now empty Vecs,
3164            // LDM producer detached).
3165            let snapshot = self.storage.clone();
3166            // Move the live tables (and LDM producer) back into the working storage.
3167            let MatcherStorage::HashChain(hc) = &mut self.storage else {
3168                unreachable!("storage variant is stable across the take/put");
3169            };
3170            hc.table.hash_table = hash_table;
3171            hc.table.chain_table = chain_table;
3172            hc.table.hash3_table = hash3_table;
3173            #[cfg(feature = "hash")]
3174            hc.set_ldm_producer(ldm_producer);
3175            self.primed = Some((snapshot, self.dictionary_retained_budget, key));
3176        } else {
3177            self.primed = Some((self.storage.clone(), self.dictionary_retained_budget, key));
3178        }
3179    }
3180
3181    fn invalidate_primed_dictionary(&mut self) {
3182        self.primed = None;
3183        // Drop the Fast-backend CDict-equivalent table cache too: it is keyed
3184        // to the dictionary being removed / replaced. Left in place, the next
3185        // same-params `reset` would retain it and the kernel would probe a
3186        // dict region whose bytes are no longer re-committed to history.
3187        match self.active_backend() {
3188            super::strategy::BackendTag::Simple => self.simple_mut().invalidate_dict_cache(),
3189            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().invalidate_dict_cache(),
3190            // Row keeps its attach index across frames (like Simple/Dfast),
3191            // so a dictionary swap must drop its cached dict rows too;
3192            // otherwise the next small/unknown-size frame reuses stale
3193            // attach state through `prime_dict_attach_current_block`.
3194            super::strategy::BackendTag::Row => self.row_matcher_mut().invalidate_dict_cache(),
3195            // The BT dms tree is keyed to the dict bytes; `prime_dms_bt`
3196            // skips the rebuild while its shape matches, so a swapped
3197            // dictionary of the same length would otherwise keep serving the
3198            // OLD dictionary's tree.
3199            super::strategy::BackendTag::HashChain => {
3200                self.hc_matcher_mut().table.dms.invalidate();
3201            }
3202        }
3203    }
3204
3205    fn seed_dictionary_entropy(
3206        &mut self,
3207        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
3208        ll: Option<&crate::fse::fse_encoder::FSETable>,
3209        ml: Option<&crate::fse::fse_encoder::FSETable>,
3210        of: Option<&crate::fse::fse_encoder::FSETable>,
3211    ) {
3212        if self.active_backend() == super::strategy::BackendTag::HashChain {
3213            self.hc_matcher_mut()
3214                .seed_dictionary_entropy(huff, ll, ml, of);
3215        }
3216    }
3217
3218    fn window_size(&self) -> u64 {
3219        self.reported_window_size as u64
3220    }
3221
3222    fn get_next_space(&mut self) -> Vec<u8> {
3223        if let Some(mut space) = self.vec_pool.pop() {
3224            if space.len() > self.slice_size {
3225                space.truncate(self.slice_size);
3226            }
3227            if space.len() < self.slice_size {
3228                space.resize(self.slice_size, 0);
3229            }
3230            return space;
3231        }
3232        alloc::vec![0; self.slice_size]
3233    }
3234
3235    fn get_last_space(&mut self) -> &[u8] {
3236        match &self.storage {
3237            MatcherStorage::Simple(m) => m.last_committed_space(),
3238            MatcherStorage::Dfast(m) => m.get_last_space(),
3239            MatcherStorage::Row(m) => m.get_last_space(),
3240            MatcherStorage::HashChain(m) => m.table.get_last_space(),
3241        }
3242    }
3243
3244    fn commit_space(&mut self, space: Vec<u8>) {
3245        let mut evicted_bytes = 0usize;
3246        // Split borrows manually so the `add_data` closures can write
3247        // into `vec_pool` while the backend itself holds an exclusive
3248        // borrow via `storage`. (Suffix-store recycling went away
3249        // with the legacy `MatchGenerator`; the FastKernelMatcher
3250        // arm below has no pool interaction.)
3251        let vec_pool = &mut self.vec_pool;
3252        match &mut self.storage {
3253            MatcherStorage::Simple(m) => {
3254                // FastKernelMatcher owns its history as a single
3255                // flat Vec<u8> and the hash table as a Vec<u32> —
3256                // neither recycles into the driver-side pools. The
3257                // eager pre-commit eviction inside
3258                // `FastKernelMatcher::accept_data` drops bytes when
3259                // accepting this block would push history past 2×
3260                // max_window_size; that delta is what feeds
3261                // `evicted_bytes` here via the `pre / post`
3262                // history-length comparison.
3263                let pre = m.history_len_for_eviction_accounting();
3264                m.accept_data(space);
3265                let post = m.history_len_for_eviction_accounting();
3266                // `accept_data` performs eager pre-commit window
3267                // eviction (so this `pre - post` delta correctly
3268                // feeds the dictionary-budget retire flow). See
3269                // `FastKernelMatcher::accept_data` for the
3270                // commit-time-visibility rationale (closes #216
3271                // CodeRabbit review #5 / Copilot review #1: without
3272                // eager eviction, the delta was always 0 and the
3273                // dict budget never retired, leaving max_window_size
3274                // inflated post-dict-prime → matcher could emit
3275                // offsets exceeding the frame header's window).
3276                evicted_bytes += pre.saturating_sub(post);
3277            }
3278            MatcherStorage::Dfast(m) => {
3279                // Dfast's `add_data` callback receives the INPUT
3280                // `Vec<u8>` for pool recycling (Dfast stores its
3281                // bytes in the contiguous `history` buffer, not in
3282                // per-block Vecs — there is no per-block buffer to
3283                // pop off and hand back). Counting `data.len()` as
3284                // evicted bytes would conflate "new bytes ingested"
3285                // with "old bytes evicted from window"; the two
3286                // happen to coincide when the previous window was
3287                // saturated and the new input fills it 1:1, but
3288                // diverge when the eviction pop-loop drops blocks
3289                // of a different size than the incoming input. The
3290                // `dictionary_retained_budget` retire decision
3291                // downstream then gets driven by inflated eviction
3292                // counts and shrinks `max_window_size` prematurely.
3293                //
3294                // Derive the real eviction delta from `window_size`
3295                // before/after the call. The pop loop inside
3296                // `add_data` decrements `window_size` by each
3297                // evicted block length and then the final
3298                // `extend_from_slice + push_back` adds `space_len`,
3299                // so `evicted = pre + space_len - post`.
3300                let pre = m.window_size;
3301                let space_len = space.len();
3302                m.add_data(space, |data| {
3303                    // Same per-block recycle as the HashChain arm: push
3304                    // the spent input buffer back as-is rather than
3305                    // zero-filling to capacity. `add_data` mirrors the
3306                    // bytes into `history` and calls this every block, so
3307                    // capacity-wide zeroing would be hot-path waste;
3308                    // `get_next_space` zeroes at most `slice_size` bytes
3309                    // when it later reuses the buffer.
3310                    vec_pool.push(data);
3311                });
3312                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3313                // block are byte counts bounded by the window, no overflow.
3314                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3315            }
3316            MatcherStorage::Row(m) => {
3317                // RowMatchGenerator::add_data recycles the *input* buffer
3318                // through this callback every commit (its bytes are mirrored
3319                // into `history`), not the evicted chunks. Derive the eviction
3320                // delta from `window_size` before/after — `evicted = pre +
3321                // space_len - post` — exactly like the Simple / HashChain arms.
3322                // Counting the callback argument as evicted would charge the
3323                // whole committed block as evicted and prematurely retire
3324                // dictionary budget on a window that evicts nothing.
3325                let pre = m.window_size;
3326                let space_len = space.len();
3327                m.add_data(space, |data| {
3328                    // Recycle the spent buffer as-is; `add_data` runs this for
3329                    // every committed block, so zero-filling to capacity here
3330                    // would be hot-path waste (`get_next_space` zeroes at most
3331                    // `slice_size` on reuse).
3332                    vec_pool.push(data);
3333                });
3334                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3335                // block are byte counts bounded by the window, no overflow.
3336                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3337            }
3338            MatcherStorage::HashChain(m) => {
3339                // MatchTable::add_data now recycles the *incoming* buffer
3340                // through `reuse_space` (its bytes are copied into the
3341                // contiguous `history` mirror), so the callback no longer
3342                // reports evicted chunks. Derive the eviction delta from
3343                // `window_size` before/after, exactly like the Simple arm:
3344                // `evicted = pre + space_len - post`.
3345                let pre = m.table.window_size;
3346                let space_len = space.len();
3347                m.table.add_data(space, |data| {
3348                    // Recycle the spent input buffer to the pool as-is.
3349                    // `add_data` runs this callback for every committed
3350                    // block (the bytes are mirrored into `history`), so
3351                    // growing the buffer to its full capacity here would
3352                    // zero the whole allocation on the hot path.
3353                    // `get_next_space` resizes a popped buffer to
3354                    // `slice_size` on demand, touching at most
3355                    // `slice_size` bytes — never the larger capacity the
3356                    // pool retains.
3357                    vec_pool.push(data);
3358                });
3359                // Plain `+` (the `saturating_sub` floors at 0): byte counts
3360                // bounded by the window, no overflow.
3361                evicted_bytes += (pre + space_len).saturating_sub(m.table.window_size);
3362            }
3363        }
3364        // Gate the second backend trim pass on actual budget
3365        // reclamation. Without it, every slice commit on the
3366        // no-dictionary / no-eviction path (the common case) would
3367        // run a backend `match` ladder + `trim_to_window` early-out
3368        // for no reason — `trim_after_budget_retire` only does
3369        // meaningful work when `retire_dictionary_budget` shrank
3370        // `max_window_size` enough to make the backend's
3371        // `window_size > max_window_size` invariant trigger
3372        // eviction.
3373        if self.retire_dictionary_budget(evicted_bytes) {
3374            self.trim_after_budget_retire();
3375        }
3376    }
3377
3378    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
3379        use super::strategy::{self, StrategyTag};
3380        // Borrowed one-shot Fast path: if the frame driver staged a
3381        // block range via `set_borrowed_block`, scan it in place against
3382        // the borrowed window instead of the owned committed block. Only
3383        // the Simple backend is instrumented (the gate guarantees it),
3384        // and the stage is consumed so the next block re-stages.
3385        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3386            match self.active_backend() {
3387                super::strategy::BackendTag::Simple => {
3388                    let m = self.simple_mut();
3389                    if m.dict_is_attached() {
3390                        // Dict-attach borrowed scan: live matches read the
3391                        // borrowed input in place, dict matches read the
3392                        // committed dict prefix via the 2-segment counter.
3393                        m.start_matching_borrowed_dict(
3394                            block_start,
3395                            block_end,
3396                            &mut handle_sequence,
3397                        );
3398                    } else {
3399                        m.start_matching_borrowed(block_start, block_end, &mut handle_sequence);
3400                    }
3401                }
3402                super::strategy::BackendTag::Dfast => self
3403                    .dfast_matcher_mut()
3404                    .start_matching_borrowed(block_start, block_end, &mut handle_sequence),
3405                super::strategy::BackendTag::Row => {
3406                    // Same greedy/lazy parse split as the owned RowHash arm.
3407                    let greedy = self.parse == super::strategy::ParseMode::Greedy;
3408                    self.row_matcher_mut().start_matching_borrowed(
3409                        block_start,
3410                        block_end,
3411                        greedy,
3412                        &mut handle_sequence,
3413                    );
3414                }
3415                super::strategy::BackendTag::HashChain => match self.search {
3416                    super::strategy::SearchMethod::HashChain => self
3417                        .hc_matcher_mut()
3418                        .start_matching_lazy_borrowed(block_start, block_end, &mut handle_sequence),
3419                    super::strategy::SearchMethod::BinaryTree => {
3420                        // Run the SAME BT dispatch as the owned BinaryTree arm
3421                        // below — every BT body reads its range via
3422                        // current_block_range() and bytes via live_history()
3423                        // (borrowed-aware), so the staged block is scanned in
3424                        // place. The table was already staged by
3425                        // `set_borrowed_block` (the HashChain arm at the top of
3426                        // this file calls `table.stage_borrowed_block` with the
3427                        // same range, and `borrowed_pending` is set only there),
3428                        // so no re-stage is needed here.
3429                        // Only btlazy2 reaches the borrowed BinaryTree scan:
3430                        // `borrowed_supported()` keeps the optimal parsers
3431                        // (BtOpt/BtUltra/BtUltra2) on the owned path, and
3432                        // `set_borrowed_block` asserts that predicate before any
3433                        // range is staged, so an optimal strategy_tag can never
3434                        // arrive here.
3435                        match self.strategy_tag {
3436                            StrategyTag::Btlazy2 => self
3437                                .hc_matcher_mut()
3438                                .start_matching_btlazy2(&mut handle_sequence),
3439                            other => unreachable!(
3440                                "borrowed BinaryTree scan is only supported for Btlazy2, got {other:?}"
3441                            ),
3442                        }
3443                    }
3444                    other => {
3445                        unreachable!("HashChain backend with unexpected search {other:?}")
3446                    }
3447                },
3448            }
3449            return;
3450        }
3451        // Decoupled parse×search dispatch (fires once per block). The
3452        // search axis (`self.search`) picks the candidate-finding backend;
3453        // the parse axis (greedy vs lazy depth) is carried by the
3454        // backend's runtime `lazy_depth`, set per level at `reset()`.
3455        // The two are independent, so any parse can run on any search
3456        // backend. The `BinaryTree` arm still selects the opt `Strategy`
3457        // ZST off `strategy_tag` so `compress_block::<S>` keeps its
3458        // const-folded optimal-parser monomorphisation.
3459        use super::strategy::SearchMethod;
3460        match self.search {
3461            SearchMethod::Fast => {
3462                self.simple_mut().start_matching(&mut handle_sequence);
3463                self.recycle_simple_space();
3464            }
3465            SearchMethod::DoubleFast => {
3466                self.dfast_matcher_mut()
3467                    .start_matching(&mut handle_sequence);
3468            }
3469            SearchMethod::RowHash => {
3470                // Greedy parse (depth 0) = upstream zstd-greedy entry (default
3471                // `ip + 1` start, greedy repcode commit); lazy / lazy2 use
3472                // the `pick_lazy_match` lookahead entry (reads `lazy_depth`).
3473                // Both bare entries dispatch on `row_log` internally into the
3474                // const-`ROW_LOG` hot loop (upstream zstd per-rowLog variant table).
3475                let greedy = self.parse == super::strategy::ParseMode::Greedy;
3476                let row = self.row_matcher_mut();
3477                if greedy {
3478                    row.start_matching_greedy(&mut handle_sequence);
3479                } else {
3480                    row.start_matching(&mut handle_sequence);
3481                }
3482            }
3483            SearchMethod::HashChain => {
3484                // Greedy/lazy/lazy2 all flow through the lazy parser; it
3485                // reads `hc.lazy_depth` (0 = greedy commit).
3486                self.hc_matcher_mut()
3487                    .start_matching_lazy(&mut handle_sequence);
3488            }
3489            SearchMethod::BinaryTree => match self.strategy_tag {
3490                StrategyTag::Btlazy2 => self
3491                    .hc_matcher_mut()
3492                    .start_matching_btlazy2(&mut handle_sequence),
3493                StrategyTag::BtOpt => self.compress_block::<strategy::BtOpt>(&mut handle_sequence),
3494                StrategyTag::BtUltra => {
3495                    self.compress_block::<strategy::BtUltra>(&mut handle_sequence)
3496                }
3497                StrategyTag::BtUltra2 => {
3498                    self.compress_block::<strategy::BtUltra2>(&mut handle_sequence)
3499                }
3500                _ => unreachable!(
3501                    "SearchMethod::BinaryTree requires a BT strategy tag (Btlazy2/BtOpt/BtUltra/BtUltra2)"
3502                ),
3503            },
3504        }
3505    }
3506
3507    fn skip_matching(&mut self) {
3508        self.skip_matching_with_hint(None);
3509    }
3510
3511    fn skip_matching_with_hint(&mut self, incompressible_hint: Option<bool>) {
3512        // Borrowed one-shot Fast path: a staged block range routes to the
3513        // borrowed skip (records the range for `get_last_space`, primes
3514        // hashes on the dict-priming hint) with no owned-history append
3515        // and nothing to recycle. Stage is consumed.
3516        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3517            match self.active_backend() {
3518                super::strategy::BackendTag::Simple => self.simple_mut().skip_matching_borrowed(
3519                    block_start,
3520                    block_end,
3521                    incompressible_hint,
3522                ),
3523                super::strategy::BackendTag::Dfast => self
3524                    .dfast_matcher_mut()
3525                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3526                super::strategy::BackendTag::Row => self.row_matcher_mut().skip_matching_borrowed(
3527                    block_start,
3528                    block_end,
3529                    incompressible_hint,
3530                ),
3531                super::strategy::BackendTag::HashChain => self
3532                    .hc_matcher_mut()
3533                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3534            }
3535            return;
3536        }
3537        match self.active_backend() {
3538            super::strategy::BackendTag::Simple => {
3539                self.simple_mut()
3540                    .skip_matching_with_hint(incompressible_hint);
3541                self.recycle_simple_space();
3542            }
3543            super::strategy::BackendTag::Dfast => {
3544                self.dfast_matcher_mut().skip_matching(incompressible_hint)
3545            }
3546            super::strategy::BackendTag::Row => self
3547                .row_matcher_mut()
3548                .skip_matching_with_hint(incompressible_hint),
3549            super::strategy::BackendTag::HashChain => {
3550                self.hc_matcher_mut().skip_matching(incompressible_hint)
3551            }
3552        }
3553    }
3554}
3555
3556impl MatchGeneratorDriver {
3557    /// Monomorphised optimal-parser entry point. Only the `BinaryTree`
3558    /// search arm of [`Matcher::start_matching`] routes here, selecting
3559    /// the concrete opt `S: Strategy` (BtOpt / BtUltra / BtUltra2) off
3560    /// `strategy_tag`, so the optimiser keeps the cost-model predicates
3561    /// (`S::USE_BT` / `S::USE_HASH3` / `S::ACCURATE_PRICE` /
3562    /// `S::TWO_PASS_SEED`) const-folded per strategy. The non-opt search
3563    /// backends (Fast / DoubleFast / RowHash / HashChain) are dispatched
3564    /// directly off the search axis and never reach this method, so all
3565    /// strategies arriving here are HashChain-backed.
3566    fn compress_block<S: super::strategy::Strategy>(
3567        &mut self,
3568        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
3569    ) {
3570        debug_assert_eq!(S::BACKEND, super::strategy::BackendTag::HashChain);
3571        debug_assert!(
3572            S::USE_BT,
3573            "compress_block only handles the optimal (BT) path"
3574        );
3575        self.hc_matcher_mut()
3576            .start_matching_strategy::<S>(handle_sequence);
3577    }
3578}
3579
3580/// Stage D: backend storage discriminator.
3581///
3582/// HC (lazy / lazy2) modes carry no extra per-frame state beyond the
3583/// shared `MatchTable` and `HcMatcher` runtime knobs, so the
3584/// [`HcBackend::Hc`] variant is zero-sized — no BT scratch is
3585/// allocated. BT-flavoured modes (`btopt` / `btultra` / `btultra2`)
3586/// hold the full [`super::bt::BtMatcher`] inside the
3587/// [`HcBackend::Bt`] variant (cost model, optimal-parser scratch
3588/// arenas, LDM candidate buffer).
3589///
3590/// The discriminator lives next to `parse_mode` so `configure()` can
3591/// promote between the two on a level change without touching the
3592/// `MatchTable` storage.
3593#[derive(Clone)]
3594pub(crate) enum HcBackend {
3595    /// Lazy / lazy2 modes — no per-frame backend state.
3596    Hc,
3597    /// BT-driven modes — owns the optimal parser's per-frame scratch.
3598    /// Boxed so the enum stays pointer-sized: HC-only matchers pay
3599    /// just the `Box`-niche, not the 4 KiB `BtMatcher` payload.
3600    Bt(alloc::boxed::Box<super::bt::BtMatcher>),
3601}
3602
3603impl HcBackend {
3604    /// Heap bytes held by the backend. `Hc` is zero-sized; `Bt` boxes a
3605    /// `BtMatcher`, so count the boxed payload plus its own scratch heap.
3606    fn heap_size(&self) -> usize {
3607        match self {
3608            Self::Hc => 0,
3609            Self::Bt(bt) => core::mem::size_of::<super::bt::BtMatcher>() + bt.heap_size(),
3610        }
3611    }
3612
3613    /// Mutable accessor on the BT matcher; panics if the active
3614    /// backend is `Hc`. The HC-or-Bt branches in orchestrator code use
3615    /// `let HcBackend::Bt(bt) = &self.backend` directly for readonly
3616    /// access — this helper exists so macro bodies that already drive
3617    /// a mutable BT update through the optimal parser can write
3618    /// `$self.backend.bt_mut().X` without an outer `match` ladder.
3619    #[inline(always)]
3620    pub(crate) fn bt_mut(&mut self) -> &mut super::bt::BtMatcher {
3621        match self {
3622            Self::Bt(bt) => bt,
3623            Self::Hc => unreachable!("BT-only accessor called in HC mode"),
3624        }
3625    }
3626}
3627
/// Match generator behind the hash-chain and binary-tree search paths.
///
/// Bundles the shared match table, the hash-chain runtime knobs, the
/// HC-vs-BT backend discriminator and a mirror of the driver's strategy
/// tag.
#[derive(Clone)]
struct HcMatchGenerator {
    /// Shared match-finder storage (window, history, hash / chain /
    /// hash3 tables, dictionary-priming flags). Used identically by HC
    /// and BT modes; backend-specific table interpretation lives in the
    /// matcher methods on this struct.
    table: super::match_table::storage::MatchTable,
    /// HC runtime knobs (lazy_depth, search_depth, target_len). Always
    /// present — BT modes still consult `hc.search_depth` for repcode
    /// probing and chain candidate enumeration.
    hc: super::hc::HcMatcher,
    /// Backend discriminator. [`HcBackend::Hc`] is zero-sized for the
    /// lazy / lazy2 path so HC-only generators don't carry the BT
    /// optimal-parser scratch buffers. [`HcBackend::Bt`] holds the
    /// `BtMatcher` when a BT mode is configured.
    ///
    /// NOTE(review): the `btlazy2` parse body reaches for
    /// `backend.bt_mut()` as well, so it presumably needs `Bt` too —
    /// confirm in `configure()`.
    backend: HcBackend,
    /// Strategy tag mirrored from
    /// [`MatchGeneratorDriver::strategy_tag`] during `configure()`.
    /// The driver hot path never reads this — it dispatches to
    /// `compress_block::<S>` from its own tag — but the
    /// `#[cfg(test)] start_matching` helper consumes it so artificial
    /// test setups still pick the correct concrete `S` for the
    /// const-generic optimal parser (BtOpt vs BtUltra vs BtUltra2).
    /// Without this field the test path would have to collapse
    /// `BtOpt` and `BtUltra` onto the same monomorphisation since
    /// `table.uses_bt` / `table.is_btultra2` alone can't tell them
    /// apart.
    strategy_tag: super::strategy::StrategyTag,
}
3657
3658// Plain-data types relocated to [`crate::encoding::opt::types`] and
3659// [`crate::encoding::opt::ldm`] by #111 Phase 1. The use statements at
3660// the top of this file bring them back into scope so the existing
3661// methods on `HcMatchGenerator` compile unchanged.
3662
/// `bt_insert_step_no_rebase` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Each kernel-specific wrapper invokes
/// the macro with its own `fastpath::<kernel>::count_match_from_indices`
/// path so the call resolves inside the wrapper's `#[target_feature]`
/// umbrella and inlines instead of paying the function-call ABI per BT walk
/// iteration. Used only by `HcMatchGenerator` BT walk wrappers below.
///
/// Arguments:
/// - `$table`: the match table; the body reads and writes its `hash_table`
///   and `chain_table` and reads `history_abs_start`, `position_base`,
///   `index_shift`, `hash_log` and `search_mls`.
/// - `$search_depth`: upper bound on the number of tree candidates compared.
/// - `$abs_pos`: absolute position being inserted into the tree.
/// - `$current_abs_end`: absolute end of the current block; caps how far a
///   match may be extended (`tail_limit`).
/// - `$target_abs`: forwarded to `window_low_abs_for_target` to obtain the
///   lowest admissible candidate position.
/// - `$cmf`: path of the match-length counter, invoked inside `unsafe` as
///   `$cmf(concat, idx, candidate_idx, tail_limit, seed_len)`.
///
/// Control flow: the body contains bare `return 1;` statements (taken when
/// fewer than 8 bytes are readable at `$abs_pos`, or when
/// `relative_position` yields `None`), so it must be expanded inside a
/// function whose return type accepts that value. Otherwise the block
/// evaluates to `max(speed_positions, match_end_abs - (abs_pos + 8))` —
/// presumably the number of positions the caller advances; confirm at the
/// call sites.
///
/// Crate-private: the macro body references private `encoding::*`
/// modules via `$crate::...`, so it is unusable downstream and is
/// re-exported only inside this crate via `pub(crate) use` below.
macro_rules! bt_insert_step_no_rebase_body {
    ($table:expr, $search_depth:expr, $abs_pos:ident, $current_abs_end:ident, $target_abs:ident, $cmf:path) => {{
        // NOTE(review): raw subtraction — assumes callers never pass an
        // `abs_pos` below `history_abs_start`; the `relative_position`
        // check further down runs only after `idx` has been used.
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        // SAFETY: pointer and length come from a live slice, so they are valid
        // for reads when the detached slice is created. The detached lifetime
        // relies on the backing buffer not being freed or reallocated while
        // `concat` is in use; the writes below go to `hash_table` and
        // `chain_table`, not to the history bytes.
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        // Hashing and the initial 8-byte baseline need 8 readable bytes.
        if idx + 8 > concat.len() {
            return 1;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT walker called past current block end"
        );
        // Maximum match length measurable from `abs_pos` inside this block.
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults; `hash` indexes
            // `hash_table` directly below, so it is in bounds.
            unsafe {
                _mm_prefetch($table.hash_table.as_ptr().add(hash).cast(), _MM_HINT_T0);
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: the prefetch itself never faults. NOTE(review):
                // `ptr::add` still requires the offset to stay inside the
                // `hash_table` allocation. `hash_next` comes from the same
                // `hash_position_at(.., hash_log, ..)` call shape as `hash`,
                // so it is presumably in range — confirm, or switch to
                // `wrapping_add` if an out-of-range value is possible.
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return 1;
        };
        // Table entries hold `relative_pos + 1`; `HC_EMPTY` marks a free slot.
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // `abs_pos < bt_mask` legitimately happens for the first BT walk of
        // a fresh frame (bt_low effectively "no floor"). Saturating keeps
        // the floor at 0 so the `candidate_abs <= bt_low` check never
        // triggers early; raw subtraction would underflow into a huge
        // sentinel that ALWAYS triggers.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        // Hoist the BT pointer-pair base out of `self` once — see the
        // collect-matches body for the full rationale (per-step Vec reload +
        // bounds check through `&mut self` vs the upstream zstd's raw `U32*` walk).
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        let window_low = $table.window_low_abs_for_target($target_abs);
        // `abs_pos + 9` is safe in raw form: `MatchTable::add_data` caps
        // total input at `usize::MAX - STREAM_ABS_HEADROOM` (where
        // `STREAM_ABS_HEADROOM = HC_OPT_NUM + 16`), so every
        // frame-lifetime absolute cursor passed to the BT walker stays
        // below `usize::MAX - 9` regardless of stream length or
        // pointer width. The guard is hoisted to the data-ingest
        // boundary so this per-position site pays zero arithmetic
        // overhead in the hot loop.
        let mut match_end_abs = $abs_pos + 9;
        let mut best_len = 8usize;
        let mut compares_left = $search_depth;
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        // The new node's own pair: slot `pair_idx` is its smaller-side link,
        // `pair_idx + 1` its larger-side link. Both are filled in as the walk
        // descends, or set to `HC_EMPTY` after the loop.
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // The previous bucket head is the first candidate; the bucket now
        // points at the position being inserted.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;

        while compares_left > 0 {
            if match_stored == $crate::encoding::match_table::storage::HC_EMPTY {
                break;
            }
            // Reject stale post-rebase slots whose pre-shift position is below
            // `index_shift` explicitly. A `wrapping_sub` maps such a slot to a
            // near-`usize::MAX` value that the `>= abs_pos` test only rejects
            // while `abs_pos` is far from the integer ceiling; on a
            // long-running rebased stream (reachable on 32-bit) `abs_pos` can
            // approach the ceiling and the wrapped value can land back inside
            // `[window_low, abs_pos)`. `checked_sub` ends the walk on the
            // underflow instead. `match_stored != HC_EMPTY` here, so the `- 1`
            // cannot underflow.
            let Some(candidate_abs) = ($table.position_base + (match_stored as usize - 1))
                .checked_sub($table.index_shift)
            else {
                break;
            };
            // Candidates outside `[window_low, abs_pos)` end the walk.
            if candidate_abs < window_low || candidate_abs >= $abs_pos {
                break;
            }
            compares_left -= 1;

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            // Bytes already known to match on both sides need no re-compare.
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()` since the candidate is within
            // `[history_abs_start, abs_pos)` and `tail_limit ≤
            // current_abs_end - abs_pos`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            if match_len > best_len {
                best_len = match_len;
                // `candidate_abs + match_len <= current_abs_end` by BT walk
                // invariant — `match_len <= tail_limit = current_abs_end -
                // abs_pos` and `candidate_abs < abs_pos`.
                let candidate_end = candidate_abs + match_len;
                if candidate_end > match_end_abs {
                    match_end_abs = candidate_end;
                }
            }

            // The match reaches the end of the block: there is no differing
            // byte to order the two suffixes by, so stop here.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // Candidate sorts below the current suffix: link it on the
                // smaller side and continue with the candidate's larger child.
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // Candidate sorts at or above the current suffix: link it on
                // the larger side and continue with its smaller child.
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // Terminate whichever side is still open with `HC_EMPTY`.
        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        // Very long matches (> 384 bytes) contribute an extra step of up to
        // 192 positions to the result.
        let speed_positions = if best_len > 384 {
            (best_len - 384).min(192)
        } else {
            0
        };
        // `match_end_abs` is initialized to `abs_pos + 9` and is only
        // reassigned inside the `candidate_end > match_end_abs` branch
        // above. So even though an individual `candidate_end =
        // candidate_abs + match_len` can land below `abs_pos` (the
        // candidate sits earlier in history and the match runs short),
        // the variable itself never drops below its initial value.
        // That gives `match_end_abs ≥ abs_pos + 9 > abs_pos + 8` as a
        // loop-wide invariant, so the raw subtraction below cannot
        // underflow.
        speed_positions.max(match_end_abs - ($abs_pos + 8))
    }};
}
pub(crate) use bt_insert_step_no_rebase_body;
3893
// NOTE(review): the paragraph below documents the
// `build_optimal_plan_impl` body macro, which is not the item that follows
// here. It is kept as a plain comment so it no longer renders as part of
// the rustdoc of `btlazy2_offbase`; move it next to the macro it describes.
//
// `build_optimal_plan_impl` body parameterized over the per-CPU
// `collect_optimal_candidates_initialized_<kernel>` method name. Caller
// passes its `&mut self`, the seven DP entry-point arguments, and the
// kernel-specific collect method. Each per-kernel wrapper invokes this
// macro inside its own `#[target_feature]` umbrella so the per-position
// `$collect` call inlines and the entire DP loop runs as one straight-line
// hot path without an ABI barrier between the DP and the match-gathering
// pipeline.
//
// Body is ~730 lines but mechanically identical across kernels — the macro
// keeps a single source of truth. The two const generics
// (`ACCURATE_PRICE`, `FAVOR_SMALL_OFFSETS`) come from the wrapper's
// generic parameter list and are referenced as bare identifiers; macro
// hygiene resolves them at the expansion site.

/// Upstream zstd `offBase` used by the btlazy2 lazy gain heuristic.
///
/// Returns the cheap repcode code `1`, `2` or `3` when `offset` equals one
/// of the three candidate repeat offsets (first match wins, in code
/// order), and `offset + 3` for any other offset — so an equal-length
/// repeat-offset match always out-gains an explicit-offset one
/// (`zstd_lazy.c` `ZSTD_storeSeq` offBase convention).
///
/// The candidate set depends on `ll0` (zero literals pending):
/// * `ll0 == false`: `reps[0]`, `reps[1]`, `reps[2]`.
/// * `ll0 == true`: `reps[1]`, `reps[2]`, `reps[0] - 1`; the last
///   candidate exists only when `reps[0] > 1`.
///
/// `offset` is narrowed with `as u32`; callers are expected to pass
/// offsets bounded by the window (<= 2^27), so neither the cast nor the
/// `+ 3` overflows.
#[inline]
fn btlazy2_offbase(offset: usize, reps: [u32; 3], ll0: bool) -> u32 {
    let raw = offset as u32;
    // Candidate repeat offsets, listed in repcode order (code = slot + 1).
    // `None` marks a candidate that does not exist for this input.
    let cheap: [Option<u32>; 3] = if ll0 {
        [
            Some(reps[1]),
            Some(reps[2]),
            (reps[0] > 1).then(|| reps[0] - 1),
        ]
    } else {
        [Some(reps[0]), Some(reps[1]), Some(reps[2])]
    };
    match cheap.iter().position(|&candidate| candidate == Some(raw)) {
        Some(slot) => slot as u32 + 1,
        // Explicit offset: shifted past the three repcode codes.
        None => raw + 3,
    }
}
3943
3944/// Upstream zstd lazy match gain (`matchLength * 4 - ZSTD_highbit32(offBase)`): the
3945/// selection metric that lets a shorter repeat-offset match beat a longer
3946/// explicit-offset one. `offBase >= 1`, so `highbit` is well-defined.
3947#[inline]
3948fn btlazy2_gain(match_len: usize, offset: usize, reps: [u32; 3], ll0: bool) -> i64 {
3949    let offbase = btlazy2_offbase(offset, reps, ll0);
3950    (match_len as i64) * 4 - (31 - offbase.leading_zeros()) as i64
3951}
3952
/// Per-kernel body of the `btlazy2` (levels 13-15) greedy/lazy parse over
/// the binary-tree match finder. Mirrors `build_optimal_plan_impl_body!`'s
/// kernel-dispatch discipline: the wrapper carries the `#[target_feature]`
/// umbrella and passes its tier-specific `collect_optimal_candidates_initialized_<kernel>`
/// as `$collect`, so the per-position BT collect (and its inlined cpl)
/// stays under one umbrella — the runtime `select_kernel()` dispatch happens
/// ONCE per block in the bare `start_matching_btlazy2`, never per position.
///
/// Macro arguments:
/// - `$self`: the matcher the body operates on (reads/writes `$self.table`,
///   `$self.backend` and `$self.hc.lazy_depth`).
/// - `$handle_sequence`: callback invoked with each emitted
///   `Sequence::Triple` and with the trailing `Sequence::Literals`.
/// - `$collect`: name of the `$self` method that fills the candidate list
///   for one position.
/// - `$cmf`: path to the unsafe match-length routine used by the explicit
///   repcode probe, called as `$cmf(concat_full, cur_idx, cand_idx, tail, 0)`
///   (presumably a "count match forward" kernel; confirm against the wrappers).
///
/// The expansion contains a bare `return`, so it must be used as the body of
/// a function returning `()`.
macro_rules! start_matching_btlazy2_body {
    ($self:ident, $handle_sequence:ident, $collect:ident, $cmf:path $(,)?) => {{
        $self.table.ensure_tables();
        // Borrowed-aware: owned → last committed chunk; borrowed → staged block.
        let (current_abs_start, current_len) = $self.table.current_block_range();
        if current_len == 0 {
            // Empty block: nothing to emit. This returns from the enclosing
            // function, before the candidate scratch buffer is taken.
            return;
        }
        let current_ptr = $self.table.get_last_space().as_ptr();
        // Mutates tables but never reallocates `history`, so this tail slice
        // stays valid for the routine's duration (same as the other parsers).
        // NOTE(review): assumes `get_last_space()` spans at least
        // `current_len` bytes; confirm against `current_block_range()`.
        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
        // Full contiguous live region (owned: dict + prior blocks + current
        // block in `history`; borrowed: `[0, block_end)` of the in-place
        // input) as a raw slice, for the explicit repcode probe: a rep offset
        // can point before the current block, which `current` can't reach.
        // `live_history()` is borrowed-aware; reborrow-then-raw-ptr so the
        // slice holds NO borrow and coexists with the `&mut self` collector
        // calls below. Same no-realloc validity contract as `current`.
        let history_abs_start = $self.table.history_abs_start;
        let concat_full: &[u8] = unsafe {
            let lh = $self.table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        let current_abs_end = current_abs_start + current_len;
        $self
            .table
            .apply_limited_update_after_long_match(current_abs_start);
        $self
            .table
            .backfill_boundary_positions(current_abs_start, current_abs_end);

        let profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::Btlazy2>();
        // Move the reusable candidate buffer out of the BT backend for the
        // duration of the parse; it is stored back at the end of the body.
        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);

        // Maximum number of lazy deferrals allowed per committed match.
        let depth = $self.hc.lazy_depth as usize;
        // `pos`: next block-relative position to search.
        // `literals_start`: first byte not yet covered by an emitted sequence.
        let mut pos = 0usize;
        let mut literals_start = 0usize;

        // Collect + select the highest-GAIN match at a position (upstream zstd
        // `ZSTD_searchMax` plus the explicit offset_1 repcode check): scan the
        // length-sorted BT/dms ladder by gain, then probe rep0 directly since
        // the ladder's strictly-increasing-length filter drops short cheap
        // reps. Expands to `(match_len, offset)`; `match_len == 0` = no match.
        macro_rules! bt_select {
            ($p:expr) => {{
                let sel_pos: usize = $p;
                // `ll0` (upstream zstd): zero literals pending before this position, so
                // the repcode set is shifted (see `btlazy2_offbase`).
                let ll0 = sel_pos == literals_start;
                let sel_abs = current_abs_start + sel_pos;
                candidates.clear();
                let query = HcCandidateQuery {
                    reps: $self.table.offset_hist,
                    lit_len: sel_pos - literals_start,
                    // No LDM seed: L13-15 run at windowLog 22, below upstream zstd's
                    // LDM auto-enable threshold (windowLog >= 27).
                    ldm_candidate: None,
                };
                // SAFETY: called inside the wrapper's `#[target_feature]`
                // umbrella (the scalar wrapper's `$collect` is a safe fn).
                unsafe {
                    $self.$collect::<super::strategy::Btlazy2, true>(
                        sel_abs,
                        current_abs_end,
                        profile,
                        query,
                        &mut candidates,
                    );
                }
                // Re-read the repeat offsets after the collector call; they
                // feed every gain computation below.
                let reps = $self.table.offset_hist;
                let mut sel_ml = 0usize;
                let mut sel_off = 0usize;
                let mut sel_gain = i64::MIN;
                for c in candidates.iter() {
                    // Clamp to the bytes left in this block so a committed
                    // match never runs past `current_len`.
                    let ml = c.match_len.min(current_len - sel_pos);
                    if ml < HC_OPT_MIN_MATCH_LEN {
                        continue;
                    }
                    let g = btlazy2_gain(ml, c.offset, reps, ll0);
                    // Strict `>`: on equal gain the earlier candidate is kept.
                    if g > sel_gain {
                        sel_gain = g;
                        sel_ml = ml;
                        sel_off = c.offset;
                    }
                }
                // Position of `sel_abs` relative to `history_abs_start`, used
                // as the index into `concat_full` for the rep probe.
                let sel_idx = sel_abs - history_abs_start;
                // Upstream zstd probes `rep[0 + ll0]` directly (the length-sorted ladder
                // drops short cheap reps): rep0 normally, rep1 at a zero-literal
                // position where rep0 is not the cheapest code.
                let probe_rep = if ll0 {
                    reps[1] as usize
                } else {
                    reps[0] as usize
                };
                // Skip a zero offset and any offset reaching before the
                // start of `concat_full`.
                if probe_rep != 0 && sel_idx >= probe_rep {
                    let tail = current_len - sel_pos;
                    // SAFETY: `sel_idx - probe_rep < sel_idx`, `sel_idx + tail <=
                    // concat_full.len()`; same overshoot slack the collector
                    // relies on for this block.
                    let rep_ml =
                        unsafe { $cmf(concat_full, sel_idx, sel_idx - probe_rep, tail, 0) };
                    // `sel_gain` is not refreshed here: this is the last
                    // comparison made for the position.
                    if rep_ml >= HC_OPT_MIN_MATCH_LEN
                        && btlazy2_gain(rep_ml, probe_rep, reps, ll0) > sel_gain
                    {
                        sel_ml = rep_ml;
                        sel_off = probe_rep;
                    }
                }
                (sel_ml, sel_off)
            }};
        }

        // Main parse loop: stop once fewer than a minimum-length match's
        // worth of bytes remain.
        while pos + HC_OPT_MIN_MATCH_LEN <= current_len {
            let (mut best_ml, mut best_off) = bt_select!(pos);
            if best_ml < HC_OPT_MIN_MATCH_LEN {
                // No usable match: step one byte; `literals_start` is left
                // alone, so the byte stays pending as a literal.
                pos += 1;
                continue;
            }
            // Lazy lookahead (upstream zstd depth 1/2): advance one byte and accept the
            // later match only if it out-gains the current one by the upstream zstd
            // margin (deferring costs an extra literal — `+4` at depth 1, `+7`
            // at depth 2). `start` tracks where the chosen match begins.
            let mut start = pos;
            // `d` counts accepted deferrals; the loop ends at the first
            // rejected or missing lookahead match.
            let mut d = 0usize;
            while d < depth && start + 1 + HC_OPT_MIN_MATCH_LEN <= current_len {
                let look = start + 1;
                let (ml2, off2) = bt_select!(look);
                if ml2 < HC_OPT_MIN_MATCH_LEN {
                    break;
                }
                let reps = $self.table.offset_hist;
                let margin = if d == 0 { 4 } else { 7 };
                // `best` sits at `start` (ll0 iff no literals precede it); the
                // lookahead match at `start + 1` always has a pending literal.
                let gain1 = btlazy2_gain(best_ml, best_off, reps, start == literals_start) + margin;
                let gain2 = btlazy2_gain(ml2, off2, reps, false);
                if gain2 > gain1 {
                    best_ml = ml2;
                    best_off = off2;
                    start = look;
                    d += 1;
                } else {
                    break;
                }
            }
            // Commit the chosen match at `start`; [literals_start, start) is
            // emitted as literals. `best_ml` was bounded to `current_len -
            // start` at selection, so `start + best_ml <= current_len`.
            let lit_len = start - literals_start;
            let literals = &current[literals_start..start];
            $handle_sequence(Sequence::Triple {
                literals,
                offset: best_off,
                match_len: best_ml,
            });
            // The returned value is not needed here; the call is made for the
            // `offset_hist` it receives by `&mut` (presumably rotating the
            // repeat offsets for the sequence just emitted; confirm in `blocks`).
            let _ = encode_offset_with_history(
                best_off as u32,
                lit_len as u32,
                &mut $self.table.offset_hist,
            );
            // Resume searching right after the match, with no pending literals.
            pos = start + best_ml;
            literals_start = pos;
        }

        // Flush whatever was not covered by a match as a final literal run.
        if literals_start < current_len {
            $handle_sequence(Sequence::Literals {
                literals: &current[literals_start..],
            });
        }
        // Hand the candidate buffer back so it can be reused.
        $self.backend.bt_mut().opt_candidates_scratch = candidates;
    }};
}
4133
/// 8-lane `next_cost < node_price` mask for the optimal-parser price-set
/// loop. AVX2 lacks an unsigned `cmplt`, so derive `nc < np` from
/// `min_epu32`: `nc <= np` iff `min(nc,np) == nc`, then exclude equality.
/// Returns a bitmask (bit `k` set => lane `k` improves). Compiled on every
/// x86 target (same as the avx2 collect kernel); the cargo `kernel_avx2`
/// feature only gates the runtime dispatch, not compilation.
///
/// # Safety
///
/// The CPU must support AVX2, and `node_price` must hold at least 8 elements:
/// exactly the first 8 are read through an unchecked 256-bit load.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_improved_mask8_avx2(next_cost: &[u32; 8], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    // Same guard the deinterleave helpers carry: the load below is unchecked.
    debug_assert!(node_price.len() >= 8);
    // SAFETY: `next_cost` is exactly 8 `u32`s; `node_price` has >= 8 per the
    // contract above. Both loads are unaligned-tolerant.
    let nc = unsafe { _mm256_loadu_si256(next_cost.as_ptr() as *const __m256i) };
    let np = unsafe { _mm256_loadu_si256(node_price.as_ptr() as *const __m256i) };
    let min = _mm256_min_epu32(nc, np);
    let le = _mm256_cmpeq_epi32(min, nc); // nc <= np
    let eq = _mm256_cmpeq_epi32(nc, np); // nc == np
    let lt = _mm256_andnot_si256(eq, le); // nc < np
    // One sign bit per 32-bit lane -> one mask bit per lane.
    _mm256_movemask_ps(_mm256_castsi256_ps(lt)) as u8
}
4166
4167/// Inline `next_cost = base_cost + ll0_price + match_price_from_parts(off,ml)`
4168/// for one match length — the exact `add_prices` chain the scalar loop uses,
4169/// so the SoA vector path stays byte-identical.
4170#[inline(always)]
4171#[allow(clippy::too_many_arguments)]
4172fn priceset_next_cost(
4173    profile: HcOptimalCostProfile,
4174    stats: &HcOptState,
4175    ml_cache: &mut [[u32; 2]],
4176    ml_stamp: u32,
4177    match_len: usize,
4178    ll0_price: u32,
4179    off_price: u32,
4180    base_cost: u32,
4181) -> u32 {
4182    let ml_price =
4183        BtMatcher::cached_match_length_price(profile, stats, match_len, ml_cache, ml_stamp);
4184    let seq_cost = BtMatcher::add_prices(
4185        ll0_price,
4186        profile.match_price_from_parts(off_price, ml_price, stats),
4187    );
4188    BtMatcher::add_prices(base_cost, seq_cost)
4189}
4190
4191/// Scalar price-set over the match-length range `[start, max]` for the
4192/// NON-abort optimal modes (btultra / btultra2). Each `match_len` writes a
4193/// distinct node `pos + match_len`, so order is irrelevant; the improvement
4194/// test reduces to `next_cost < node_prices[next]` (`reset_opt_nodes` set
4195/// every beyond-frontier cell to `u32::MAX`, subsuming `next > last_pos`).
4196/// `#[inline]` so it folds into each per-tier optimal-parser monomorphisation
4197/// (no call overhead). Returns the highest written `next`.
4198#[inline]
4199#[allow(clippy::too_many_arguments)]
4200// Used by the scalar / sse42 DP wrappers; on aarch64 the dispatch only reaches
4201// the neon wrapper and on wasm+simd128 only the simd128 wrapper, so this is
4202// cfg-dead on those targets.
4203#[cfg_attr(
4204    any(
4205        all(target_arch = "aarch64", target_endian = "little"),
4206        all(target_arch = "wasm32", target_feature = "simd128")
4207    ),
4208    allow(dead_code)
4209)]
4210fn priceset_range_nonabort_scalar(
4211    node_prices: &mut [u32],
4212    nodes: &mut [HcOptimalNode],
4213    ml_cache: &mut [[u32; 2]],
4214    ml_stamp: u32,
4215    profile: HcOptimalCostProfile,
4216    stats: &HcOptState,
4217    pos: usize,
4218    start: usize,
4219    max: usize,
4220    ll0_price: u32,
4221    off_price: u32,
4222    base_cost: u32,
4223    off: u32,
4224    reps: [u32; 3],
4225    last_pos: usize,
4226) -> usize {
4227    let mut new_last = last_pos;
4228    for ml in start..=max {
4229        let next_cost = priceset_next_cost(
4230            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4231        );
4232        let next = pos + ml;
4233        if next_cost < node_prices[next] {
4234            node_prices[next] = next_cost;
4235            nodes[next] = HcOptimalNode {
4236                off,
4237                mlen: ml as u32,
4238                litlen: 0,
4239                reps,
4240            };
4241            if next > new_last {
4242                new_last = next;
4243            }
4244        }
4245    }
4246    new_last
4247}
4248
/// Per-tier deinterleave + improve-mask correctness vs a scalar reference.
/// Each tier's dispatch only fires on matching hardware (i9 picks AVX2 over
/// SSE4.1, M1 picks NEON), so the non-dispatched tiers never run in the
/// roundtrip suite; this exercises the deinterleave/mask helpers directly on
/// whatever ISA the test host exposes (AVX2 + SSE4.1 on x86, NEON on aarch64).
///
/// The mask fixtures deliberately include a `next_cost` with the top bit set
/// and a `u32::MAX` node price (the beyond-frontier sentinel), so a helper
/// that compared lanes as signed integers would disagree with the reference.
#[cfg(test)]
#[test]
fn priceset_tier_helpers_match_scalar() {
    // Reference: gen-stamped contiguous cells -> ordered prices on all-warm.
    fn scalar_deint<const W: usize>(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; W]> {
        let mut out = [0u32; W];
        for k in 0..W {
            if cells[k][1] != stamp {
                return None;
            }
            out[k] = cells[k][0];
        }
        Some(out)
    }
    // Reference: unsigned strict `nc < np`, one bit per lane.
    fn scalar_mask<const W: usize>(nc: &[u32; W], np: &[u32]) -> u8 {
        let mut m = 0u8;
        for k in 0..W {
            if nc[k] < np[k] {
                m |= 1 << k;
            }
        }
        m
    }
    const S: u32 = 0x55;
    let warm: [[u32; 2]; 4] = [[11, S], [22, S], [33, S], [44, S]];
    let mut cold = warm;
    cold[2][1] = S ^ 1; // one stale cell -> must yield None
    // lt: lane0 (10<20), lane3 (41<u32::MAX). lane1 has the top bit set in
    // `nc` (signed compare would wrongly report it), lane3 has it set in `np`
    // (signed compare would wrongly miss it), lane2 is the equality case.
    let nc4: [u32; 4] = [10, 0x8000_0000, 30, 41];
    let np4: [u32; 4] = [20, 21, 30, u32::MAX];
    assert_eq!(scalar_mask::<4>(&nc4, &np4), 0b1001);

    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    unsafe {
        assert_eq!(
            priceset_cached_prices4_neon(&warm, S),
            scalar_deint::<4>(&warm, S)
        );
        assert_eq!(priceset_cached_prices4_neon(&cold, S), None);
        assert_eq!(
            priceset_improved_mask4_neon(&nc4, &np4),
            scalar_mask::<4>(&nc4, &np4)
        );
    }
    #[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
    {
        if std::is_x86_feature_detected!("sse4.2") {
            unsafe {
                assert_eq!(
                    priceset_cached_prices4_sse41(&warm, S),
                    scalar_deint::<4>(&warm, S)
                );
                assert_eq!(priceset_cached_prices4_sse41(&cold, S), None);
                assert_eq!(
                    priceset_improved_mask4_sse41(&nc4, &np4),
                    scalar_mask::<4>(&nc4, &np4)
                );
            }
        }
        if std::is_x86_feature_detected!("avx2") {
            let warm8: [[u32; 2]; 8] = [
                [11, S],
                [22, S],
                [33, S],
                [44, S],
                [55, S],
                [66, S],
                [77, S],
                [88, S],
            ];
            let mut cold8 = warm8;
            cold8[5][1] = S ^ 1;
            // Same signedness probes as the 4-lane fixture, in lanes 1 and 3.
            let nc8: [u32; 8] = [10, 0x8000_0000, 30, 41, 99, 60, 99, 80];
            let np8: [u32; 8] = [20, 21, 30, u32::MAX, 50, 99, 70, 99];
            unsafe {
                assert_eq!(
                    priceset_cached_prices8_avx2(&warm8, S),
                    scalar_deint::<8>(&warm8, S)
                );
                assert_eq!(priceset_cached_prices8_avx2(&cold8, S), None);
                assert_eq!(
                    priceset_improved_mask8_avx2(&nc8, &np8),
                    scalar_mask::<8>(&nc8, &np8)
                );
            }
        }
    }
}
4340
4341/// Shared vectorised price-set loop body, generic over the SIMD width `W`.
4342/// The per-tier `deint` (vector-load plus deinterleave of `W` cached prices,
4343/// returning `Some` only on an all-warm chunk) and `mask` (per-tier
4344/// `next_cost` less-than `node_price` bitmask) are passed as zero-sized
4345/// `impl Fn`s. `#[inline(always)]` plus monomorphisation folds `deint` and
4346/// `mask` directly into each per-tier wrapper's `target_feature` umbrella, so
4347/// the intrinsics inline with no call ABI and no runtime feature detection.
4348/// Cold or out-of-cache chunks, and the sub-`W` remainder, fall back to the
4349/// scalar `priceset_next_cost` (which fills the cache); writes are
4350/// scalar-scatter on the improving lanes (1-8% of compares, per the
4351/// improve-ratio probe). Same signature tail as the scalar variant.
4352#[inline(always)]
4353#[allow(clippy::too_many_arguments)]
4354// Instantiated only by a vector tier wrapper (avx2/sse4.1 on x86, neon on
4355// aarch64, simd128 on wasm+simd128); a target with none of those (e.g.
4356// wasm without +simd128) uses only the scalar range, leaving this generic dead.
4357#[cfg_attr(
4358    not(any(
4359        target_arch = "x86",
4360        target_arch = "x86_64",
4361        all(target_arch = "aarch64", target_endian = "little"),
4362        all(target_arch = "wasm32", target_feature = "simd128")
4363    )),
4364    allow(dead_code)
4365)]
4366fn priceset_range_vec<const W: usize>(
4367    node_prices: &mut [u32],
4368    nodes: &mut [HcOptimalNode],
4369    ml_cache: &mut [[u32; 2]],
4370    ml_stamp: u32,
4371    profile: HcOptimalCostProfile,
4372    stats: &HcOptState,
4373    pos: usize,
4374    start: usize,
4375    max: usize,
4376    ll0_price: u32,
4377    off_price: u32,
4378    base_cost: u32,
4379    off: u32,
4380    reps: [u32; 3],
4381    last_pos: usize,
4382    deint: impl Fn(&[[u32; 2]], u32) -> Option<[u32; W]>,
4383    mask: impl Fn(&[u32; W], &[u32]) -> u8,
4384) -> usize {
4385    let mut new_last = last_pos;
4386    let mut buf = [0u32; W];
4387    // Loop-invariant constant of the byte-identical next_cost chain:
4388    // next_cost = add_prices(base_cost, add_prices(ll0_price,
4389    //   match_price_from_parts(off_price, ml_price))) = c_base + ml_price,
4390    // c_base = base_cost + ll0_price + match_price_from_parts(off_price, 0).
4391    //
4392    // This stays bit-exact with the scalar `priceset_next_cost` because both
4393    // helpers are affine in `ml_price`: `BtMatcher::add_prices(a, b) = a + b`
4394    // and `match_price_from_parts(off, ml) = off + ml + bias` are plain integer
4395    // additions, so `match_price_from_parts(off, ml) = match_price_from_parts(
4396    // off, 0) + ml` and the whole chain collapses to `c_base + ml_price`. The
4397    // `wrapping_add` here matches the scalar `+` under the cost model's
4398    // no-overflow invariant (the `debug_assert`s in both helpers). Factoring the
4399    // combine into one helper per the review suggestion would force a per-lane
4400    // `match_price_from_parts(off, ml_price)` recompute instead of hoisting the
4401    // ml-independent `c_base` once — a regression on this hot DP loop — so the
4402    // hoist is kept and the equivalence documented here instead.
4403    let c_base = base_cost
4404        .wrapping_add(ll0_price)
4405        .wrapping_add(profile.match_price_from_parts(off_price, 0, stats));
4406    let mut ml = start;
4407    while ml + W <= max + 1 {
4408        let vectorised = if ml + W <= ml_cache.len() {
4409            deint(&ml_cache[ml..ml + W], ml_stamp)
4410        } else {
4411            None
4412        };
4413        if let Some(prices) = vectorised {
4414            for (k, slot) in buf.iter_mut().enumerate() {
4415                *slot = c_base.wrapping_add(prices[k]);
4416            }
4417        } else {
4418            for (k, slot) in buf.iter_mut().enumerate() {
4419                *slot = priceset_next_cost(
4420                    profile,
4421                    stats,
4422                    ml_cache,
4423                    ml_stamp,
4424                    ml + k,
4425                    ll0_price,
4426                    off_price,
4427                    base_cost,
4428                );
4429            }
4430        }
4431        let base_next = pos + ml;
4432        let mut bits = mask(&buf, &node_prices[base_next..base_next + W]);
4433        while bits != 0 {
4434            let k = bits.trailing_zeros() as usize;
4435            bits &= bits - 1;
4436            let next = base_next + k;
4437            node_prices[next] = buf[k];
4438            nodes[next] = HcOptimalNode {
4439                off,
4440                mlen: (ml + k) as u32,
4441                litlen: 0,
4442                reps,
4443            };
4444            if next > new_last {
4445                new_last = next;
4446            }
4447        }
4448        ml += W;
4449    }
4450    while ml <= max {
4451        let next_cost = priceset_next_cost(
4452            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4453        );
4454        let next = pos + ml;
4455        if next_cost < node_prices[next] {
4456            node_prices[next] = next_cost;
4457            nodes[next] = HcOptimalNode {
4458                off,
4459                mlen: ml as u32,
4460                litlen: 0,
4461                reps,
4462            };
4463            if next > new_last {
4464                new_last = next;
4465            }
4466        }
4467        ml += 1;
4468    }
4469    new_last
4470}
4471
/// AVX2 vector load of 8 cached ml-prices for the optimal parser's price-set,
/// from a run of 8 contiguous `[price, generation]` cells.
///
/// Yields `Some(prices)` (in cell order) only when ALL eight generations equal
/// `stamp` — the common (~91-98%) case — so the caller can fold them with one
/// broadcast constant. Any cold cell yields `None`, routing the chunk through
/// the scalar fill (which recomputes + repopulates the misses).
///
/// The deinterleave uses cheap in-128-lane ops (`shuffle_epi32` +
/// `unpack*_epi64`) and a single cross-lane `permute4x64` for the ordered
/// prices — avoiding the latency-bound chain of cross-lane `permutevar8x32`s
/// that lost to pipelined scalar loads on high-chunk-count fixtures.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_cached_prices8_avx2(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 8]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256, _mm256_movemask_ps,
        _mm256_permute4x64_epi64, _mm256_set1_epi32, _mm256_shuffle_epi32, _mm256_storeu_si256,
        _mm256_unpackhi_epi64, _mm256_unpacklo_epi64,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256, _mm256_movemask_ps,
        _mm256_permute4x64_epi64, _mm256_set1_epi32, _mm256_shuffle_epi32, _mm256_storeu_si256,
        _mm256_unpackhi_epi64, _mm256_unpacklo_epi64,
    };
    debug_assert!(cells.len() >= 8);
    let src = cells.as_ptr().cast::<__m256i>();
    // cells 0..4 = [p0 g0 p1 g1 | p2 g2 p3 g3], cells 4..8 = [p4 g4 p5 g5 | p6 g6 p7 g7].
    let first_half = unsafe { _mm256_loadu_si256(src) };
    let second_half = unsafe { _mm256_loadu_si256(src.add(1)) };
    // Control 0xD8 picks elements (0, 2, 1, 3) inside each 128-bit lane:
    // [p g p g] -> [p p g g].
    let first_grouped = _mm256_shuffle_epi32::<0xD8>(first_half); // [p0 p1 g0 g1 | p2 p3 g2 g3]
    let second_grouped = _mm256_shuffle_epi32::<0xD8>(second_half); // [p4 p5 g4 g5 | p6 p7 g6 g7]
    // High 64 bits of every 128-bit lane are generations; their order does
    // not matter for an all-equal test.
    let generations = _mm256_unpackhi_epi64(first_grouped, second_grouped);
    let warm = _mm256_cmpeq_epi32(generations, _mm256_set1_epi32(stamp as i32));
    let warm_lanes = _mm256_movemask_ps(_mm256_castsi256_ps(warm));
    if warm_lanes != 0xFF {
        return None;
    }
    // Low 64 bits of every 128-bit lane are prices: as 64-bit chunks
    // [p0p1 p4p5 p2p3 p6p7]. Control 0xD8 selects chunks (0, 2, 1, 3),
    // restoring [p0..p7].
    let scrambled = _mm256_unpacklo_epi64(first_grouped, second_grouped);
    let ordered = _mm256_permute4x64_epi64::<0xD8>(scrambled);
    let mut prices = [0u32; 8];
    unsafe { _mm256_storeu_si256(prices.as_mut_ptr().cast::<__m256i>(), ordered) };
    Some(prices)
}
4521
4522#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4523#[target_feature(enable = "avx2")]
4524#[inline]
4525#[allow(clippy::too_many_arguments)]
4526unsafe fn priceset_range_nonabort_avx2(
4527    node_prices: &mut [u32],
4528    nodes: &mut [HcOptimalNode],
4529    ml_cache: &mut [[u32; 2]],
4530    ml_stamp: u32,
4531    profile: HcOptimalCostProfile,
4532    stats: &HcOptState,
4533    pos: usize,
4534    start: usize,
4535    max: usize,
4536    ll0_price: u32,
4537    off_price: u32,
4538    base_cost: u32,
4539    off: u32,
4540    reps: [u32; 3],
4541    last_pos: usize,
4542) -> usize {
4543    priceset_range_vec::<8>(
4544        node_prices,
4545        nodes,
4546        ml_cache,
4547        ml_stamp,
4548        profile,
4549        stats,
4550        pos,
4551        start,
4552        max,
4553        ll0_price,
4554        off_price,
4555        base_cost,
4556        off,
4557        reps,
4558        last_pos,
4559        // SAFETY: both closures run inside this fn's avx2 target_feature umbrella.
4560        |cells, stamp| unsafe { priceset_cached_prices8_avx2(cells, stamp) },
4561        |nc, np| unsafe { priceset_improved_mask8_avx2(nc, np) },
4562    )
4563}
4564
/// NEON 4-lane load of cached ml-prices from 4 contiguous
/// `[price, generation]` cells. `vld2q_u32` performs the deinterleave
/// natively, producing one register of prices and one of generations — no
/// shuffle chain. Yields `Some(prices)` only when every generation equals
/// `stamp` (the minimum across the equality mask is then all-ones), otherwise
/// `None`.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_cached_prices4_neon(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::aarch64::{vceqq_u32, vdupq_n_u32, vld2q_u32, vminvq_u32, vst1q_u32};
    debug_assert!(cells.len() >= 4);
    // SAFETY: caller's neon umbrella; `cells` is >= 4 pairs = 8 contiguous u32.
    let split = unsafe { vld2q_u32(cells.as_ptr().cast::<u32>()) };
    let (price_lanes, generation_lanes) = (split.0, split.1);
    let warm = vceqq_u32(generation_lanes, vdupq_n_u32(stamp));
    // Any stale lane leaves a zero in the mask and drags the minimum down.
    if vminvq_u32(warm) == u32::MAX {
        let mut prices = [0u32; 4];
        // SAFETY: `prices` is 4 writable `u32`s.
        unsafe { vst1q_u32(prices.as_mut_ptr(), price_lanes) };
        Some(prices)
    } else {
        None
    }
}
4585
/// NEON 4-lane `next_cost < node_price` bitmask. NEON has an unsigned compare
/// (`vcltq_u32`) but no movemask; AND the all-ones lane mask with lane weights
/// `[1,2,4,8]` and horizontal-add (`vaddvq_u32`) to pack the 4 bits.
/// Bit `k` of the result is set when lane `k` improves.
///
/// # Safety
///
/// The CPU must support NEON, and `node_price` must hold at least 4 elements:
/// exactly the first 4 are read through an unchecked 128-bit load.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_improved_mask4_neon(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::aarch64::{vaddvq_u32, vandq_u32, vcltq_u32, vld1q_u32};
    // Same guard the deinterleave helpers carry: the load below is unchecked.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: neon umbrella; both spans are 4 u32 wide.
    let nc = unsafe { vld1q_u32(next_cost.as_ptr()) };
    let np = unsafe { vld1q_u32(node_price.as_ptr()) };
    let lt = vcltq_u32(nc, np);
    let weights: [u32; 4] = [1, 2, 4, 8];
    // SAFETY: `weights` is 4 readable `u32`s.
    let w = unsafe { vld1q_u32(weights.as_ptr()) };
    // Each improving lane keeps only its own weight; the sum is the bitmask.
    let bits = vandq_u32(lt, w);
    vaddvq_u32(bits) as u8
}
4604
/// NEON tier of the non-abort price-set: instantiates `priceset_range_vec`
/// at 4 lanes with the NEON deinterleave and improve-mask helpers. All
/// arguments are forwarded unchanged and the return value is that of
/// `priceset_range_vec`.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_neon(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // SAFETY: both closures run inside this fn's neon target_feature umbrella.
    let deinterleave =
        |cells: &[[u32; 2]], stamp: u32| unsafe { priceset_cached_prices4_neon(cells, stamp) };
    let improved = |nc: &[u32; 4], np: &[u32]| unsafe { priceset_improved_mask4_neon(nc, np) };
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        deinterleave,
        improved,
    )
}
4647
/// SSE4.1 4-lane load of cached ml-prices from 4 contiguous
/// `[price, generation]` cells. Two 128-bit loads bring in the pairs,
/// `shuffle_epi32(0xD8)` groups prices ahead of generations inside each
/// register, and `unpacklo/hi_epi64` splits them into a price vector and a
/// generation vector. Yields `Some(prices)` (in cell order) only when all 4
/// generations equal `stamp`, otherwise `None`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_cached_prices4_sse41(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128, _mm_movemask_ps,
        _mm_set1_epi32, _mm_shuffle_epi32, _mm_storeu_si128, _mm_unpackhi_epi64,
        _mm_unpacklo_epi64,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128, _mm_movemask_ps,
        _mm_set1_epi32, _mm_shuffle_epi32, _mm_storeu_si128, _mm_unpackhi_epi64,
        _mm_unpacklo_epi64,
    };
    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr().cast::<__m128i>();
    let front = unsafe { _mm_loadu_si128(src) }; // [p0 g0 p1 g1]
    let back = unsafe { _mm_loadu_si128(src.add(1)) }; // [p2 g2 p3 g3]
    // Control 0xD8 picks elements (0, 2, 1, 3): [p g p g] -> [p p g g].
    let front_grouped = _mm_shuffle_epi32::<0xD8>(front); // [p0 p1 g0 g1]
    let back_grouped = _mm_shuffle_epi32::<0xD8>(back); // [p2 p3 g2 g3]
    let generations = _mm_unpackhi_epi64(front_grouped, back_grouped); // [g0 g1 g2 g3]
    let warm = _mm_cmpeq_epi32(generations, _mm_set1_epi32(stamp as i32));
    let warm_lanes = _mm_movemask_ps(_mm_castsi128_ps(warm)) & 0x0F;
    if warm_lanes != 0x0F {
        return None;
    }
    let ordered = _mm_unpacklo_epi64(front_grouped, back_grouped); // [p0 p1 p2 p3]
    let mut prices = [0u32; 4];
    unsafe { _mm_storeu_si128(prices.as_mut_ptr().cast::<__m128i>(), ordered) };
    Some(prices)
}
4684
/// SSE4.1 4-lane `next_cost < node_price` bitmask (unsigned compare via
/// `min_epu32`, like the AVX2 path): `nc <= np` iff `min(nc,np) == nc`, then
/// equality is excluded. Bit `k` of the result is set when lane `k` improves;
/// the upper four bits are always zero.
///
/// # Safety
///
/// The CPU must support the enabled target feature, and `node_price` must
/// hold at least 4 elements: exactly the first 4 are read through an
/// unchecked 128-bit load.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_improved_mask4_sse41(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    // Same guard the deinterleave helpers carry: the load below is unchecked.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: `next_cost` is exactly 4 `u32`s; `node_price` has >= 4 per the
    // contract above. Both loads are unaligned-tolerant.
    let nc = unsafe { _mm_loadu_si128(next_cost.as_ptr() as *const __m128i) };
    let np = unsafe { _mm_loadu_si128(node_price.as_ptr() as *const __m128i) };
    let min = _mm_min_epu32(nc, np);
    let le = _mm_cmpeq_epi32(min, nc); // nc <= np
    let eq = _mm_cmpeq_epi32(nc, np); // nc == np
    let lt = _mm_andnot_si128(eq, le); // nc < np
    (_mm_movemask_ps(_mm_castsi128_ps(lt)) as u8) & 0x0F
}
4709
/// x86 four-lane instantiation of the vectorised price-set range kernel.
///
/// Pure forwarder: every argument is handed unchanged, in the same order, to
/// `priceset_range_vec::<4>`, followed by two closures that wrap
/// `priceset_cached_prices4_sse41` (cached ml-price load + stamp check) and
/// `priceset_improved_mask4_sse41` (`next_cost < node_price` lane mask).
/// The returned `usize` is whatever `priceset_range_vec` yields; its meaning
/// is defined there.
///
/// NOTE(review): the `nonabort` suffix presumably ties this variant to the
/// parse modes that do not abort on a worse match — confirm at the call site.
///
/// # Safety
///
/// The executing CPU must support SSE4.2, since this function and the helpers
/// its closures call are compiled under `#[target_feature(enable = "sse4.2")]`.
/// Any additional preconditions on the slices and indices are those of
/// `priceset_range_vec` and the two helpers.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_sse41(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // The const generic `4` is the lane count matching the 128-bit helpers.
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        // SAFETY: both closures run inside this fn's sse4.2 target_feature umbrella.
        |cells, stamp| unsafe { priceset_cached_prices4_sse41(cells, stamp) },
        |nc, np| unsafe { priceset_improved_mask4_sse41(nc, np) },
    )
}
4752
/// wasm `simd128` probe of four consecutive cached ml-price cells.
///
/// `cells` is read as two 128-bit vectors of interleaved `[price, gen]`
/// pairs; `u32x4_shuffle` then pulls the odd (generation) and even (price)
/// lanes out of the pair of vectors. The result is `Some([p0, p1, p2, p3])`
/// only when all four generations equal `stamp`, otherwise `None`.
///
/// The caller must supply at least four cells (checked by `debug_assert!`
/// only).
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_cached_prices4_simd128(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::wasm32::{
        u32x4_all_true, u32x4_eq, u32x4_shuffle, u32x4_splat, v128, v128_load, v128_store,
    };
    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr().cast::<v128>();
    // SAFETY: four `[u32; 2]` cells span 32 bytes, covering both 16-byte loads.
    let (front, back) = unsafe { (v128_load(src), v128_load(src.add(1))) };
    // front = [p0 g0 p1 g1], back = [p2 g2 p3 g3]; shuffle indices 0..=3
    // select from `front`, 4..=7 from `back`.
    let stamps = u32x4_shuffle::<1, 3, 5, 7>(front, back); // [g0 g1 g2 g3]
    if u32x4_all_true(u32x4_eq(stamps, u32x4_splat(stamp))) {
        let packed = u32x4_shuffle::<0, 2, 4, 6>(front, back); // [p0 p1 p2 p3]
        let mut prices = [0u32; 4];
        // SAFETY: `prices` is a 16-byte local array.
        unsafe { v128_store(prices.as_mut_ptr().cast::<v128>(), packed) };
        Some(prices)
    } else {
        None
    }
}
4779
/// wasm `simd128` four-lane `next_cost[i] < node_price[i]` bitmask.
///
/// The comparison is unsigned (`u32x4_lt`) and `u32x4_bitmask` packs one bit
/// per lane, so bit `i` of the result is set when lane `i` of `next_cost` is
/// strictly below lane `i` of `node_price`.
///
/// Reads the first four elements of `node_price` with one 16-byte load, so
/// the caller must pass a slice holding at least four prices.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_improved_mask4_simd128(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::wasm32::{u32x4_bitmask, u32x4_lt, v128, v128_load};
    // SAFETY: `next_cost` is exactly four u32s (16 bytes).
    let costs = unsafe { v128_load(next_cost.as_ptr().cast::<v128>()) };
    // SAFETY: caller contract — `node_price` holds at least four u32s.
    let current = unsafe { v128_load(node_price.as_ptr().cast::<v128>()) };
    let cheaper = u32x4_lt(costs, current);
    u32x4_bitmask(cheaper)
}
4791
/// wasm `simd128` four-lane instantiation of the vectorised price-set range
/// kernel.
///
/// Pure forwarder: every argument is handed unchanged, in the same order, to
/// `priceset_range_vec::<4>`, followed by two closures that wrap
/// `priceset_cached_prices4_simd128` (cached ml-price load + stamp check) and
/// `priceset_improved_mask4_simd128` (`next_cost < node_price` lane mask).
/// The returned `usize` is whatever `priceset_range_vec` yields; its meaning
/// is defined there.
///
/// NOTE(review): the `nonabort` suffix presumably ties this variant to the
/// parse modes that do not abort on a worse match — confirm at the call site.
///
/// # Safety
///
/// Only compiled when the target already has `simd128` enabled (see the
/// `cfg` gate). Any preconditions on the slices and indices are those of
/// `priceset_range_vec` and the two helpers the closures call.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_simd128(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // The const generic `4` is the lane count matching the 128-bit helpers.
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        // SAFETY: both closures run inside this fn's simd128 target_feature umbrella.
        |cells, stamp| unsafe { priceset_cached_prices4_simd128(cells, stamp) },
        |nc, np| unsafe { priceset_improved_mask4_simd128(nc, np) },
    )
}
4834
4835macro_rules! build_optimal_plan_impl_body {
4836    (
4837        $self:expr,
4838        $strategy_ty:ty,
4839        $current:ident,
4840        $current_abs_start:ident,
4841        $current_len:ident,
4842        $initial_state:ident,
4843        $stats:ident,
4844        $out:ident,
4845        $collect:ident,
4846        $priceset:path $(,)?
4847    ) => {{
4848        let current_abs_end = $current_abs_start + $current_len;
4849        let min_match_len = HC_OPT_MIN_MATCH_LEN;
4850        // `HC_OPT_NUM > 0` by const definition, so `HC_OPT_NUM - 1` is safe.
4851        let frontier_limit = $current_len.min(HC_OPT_NUM - 1);
4852        let initial_reps = $initial_state.reps;
4853        let initial_litlen = $initial_state.litlen;
4854        let ldm_block_offset = $initial_state.block_offset;
4855        let mut profile = $initial_state.profile;
4856        profile.sufficient_match_len = $self.hc.sufficient_match_len_for_pass(profile);
4857        // Const-fold from the strategy's associated `OPT_LEVEL`
4858        // (upstream zstd `optLevel`): BtOpt = 0, BtUltra / BtUltra2 = 2.
4859        // The two flags below are the only places the inner DP loop
4860        // used to consult `parse_mode`; lifting them into const
4861        // expressions drops one indirect read + one branch on every
4862        // candidate insertion and every traceback step.
4863        // `let` (not `const`) — nested `const` items inside a
4864        // generic fn cannot project through the outer fn's type
4865        // parameter, but a `let` binding from a const expression
4866        // does get folded by the optimiser per monomorphisation,
4867        // which is what we actually want here.
4868        debug_assert!(
4869            <$strategy_ty as super::strategy::Strategy>::USE_BT,
4870            "build_optimal_plan_impl_body called on non-BT strategy"
4871        );
4872        let abort_on_worse_match: bool =
4873            <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL == 0;
4874        let opt_level: bool = <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL >= 2;
4875        let mut nodes = core::mem::take(&mut $self.backend.bt_mut().opt_nodes_scratch);
4876        let mut node_prices = core::mem::take(&mut $self.backend.bt_mut().opt_node_prices_scratch);
4877        // `frontier_limit + 2 <= HC_OPT_NODE_LEN` — bounded by const.
4878        let frontier_buffer_size = frontier_limit + 2;
4879        if nodes.len() < HC_OPT_NODE_LEN {
4880            // First optimal-parse use (empty boxed slice) or an undersized
4881            // buffer: allocate the fixed upstream-zstd-sized frontier once. The DP
4882            // overwrites the active prefix before reading it.
4883            nodes = alloc::vec![HcOptimalNode::default(); HC_OPT_NODE_LEN].into_boxed_slice();
4884        }
4885        // The DP price array, same fixed length as `nodes`. This is the SOLE
4886        // home of each position's price (the node struct carries no price), so
4887        // the SIMD price-set vector-loads it directly. Initialised to u32::MAX
4888        // so unwritten frontier cells compare as "unreachable".
4889        if node_prices.len() < HC_OPT_NODE_LEN {
4890            node_prices = alloc::vec![u32::MAX; HC_OPT_NODE_LEN].into_boxed_slice();
4891        }
4892        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);
4893        candidates.clear();
4894        if candidates.capacity() < MAX_HC_SEARCH_DEPTH {
4895            candidates.reserve_exact(MAX_HC_SEARCH_DEPTH - candidates.capacity());
4896        }
4897        let mut store = core::mem::take(&mut $self.backend.bt_mut().opt_store_scratch);
4898        store.clear();
4899        let mut price_arena = core::mem::take(&mut $self.backend.bt_mut().opt_price_arena);
4900        if price_arena.len() < HC_OPT_PRICE_ARENA_LEN {
4901            price_arena = alloc::vec![[0u32; 2]; HC_OPT_PRICE_ARENA_LEN].into_boxed_slice();
4902        }
4903        // Single arena → two disjoint fixed-stride regions of `[price,
4904        // generation]` pairs (LL cache, ML cache): one base pointer + fixed
4905        // offsets, mirroring upstream zstd's single opt workspace. Pairing
4906        // price+generation per code keeps the optimal parser's cache probe
4907        // on ONE line instead of two strided regions.
4908        // SAFETY: `price_arena` is exactly `HC_OPT_PRICE_ARENA_LEN =
4909        // 2 * HC_OPT_PRICE_STRIDE` pairs long (just ensured), so the two
4910        // STRIDE-wide regions are in bounds and disjoint. The slices alias
4911        // the heap buffer `price_arena` owns; that heap address is stable
4912        // across the later move of the `price_arena` box into the result
4913        // bundle (a `Box` move relocates only the pointer, not the heap
4914        // data), and the slices are never used after the bundle is
4915        // constructed. The fixed STRIDE (independent of `frontier_limit`)
4916        // keeps every code's cell at a constant offset so the monotonic
4917        // stamps stay valid across calls with different frontiers.
4918        let arena_base = price_arena.as_mut_ptr();
4919        let mut ll_cache: &mut [[u32; 2]] =
4920            unsafe { core::slice::from_raw_parts_mut(arena_base, HC_OPT_PRICE_STRIDE) };
4921        let mut ml_cache: &mut [[u32; 2]] = unsafe {
4922            core::slice::from_raw_parts_mut(arena_base.add(HC_OPT_PRICE_STRIDE), HC_OPT_PRICE_STRIDE)
4923        };
4924        $self.backend.bt_mut().opt_ll_price_stamp = $self
4925            .backend
4926            .bt_mut()
4927            .opt_ll_price_stamp
4928            .wrapping_add(1)
4929            .max(1);
4930        let ll_price_stamp = $self.backend.bt_mut().opt_ll_price_stamp;
4931        $self.backend.bt_mut().opt_lit_price_stamp = $self
4932            .backend
4933            .bt_mut()
4934            .opt_lit_price_stamp
4935            .wrapping_add(1)
4936            .max(1);
4937        let lit_price_stamp = $self.backend.bt_mut().opt_lit_price_stamp;
4938        $self.backend.bt_mut().opt_ml_price_stamp = $self
4939            .backend
4940            .bt_mut()
4941            .opt_ml_price_stamp
4942            .wrapping_add(1)
4943            .max(1);
4944        let ml_price_stamp = $self.backend.bt_mut().opt_ml_price_stamp;
4945        let node0_price = BtMatcher::cached_lit_length_price(
4946            profile,
4947            $stats,
4948            initial_litlen,
4949            &mut ll_cache,
4950            ll_price_stamp,
4951        );
4952        nodes[0] = HcOptimalNode {
4953            litlen: initial_litlen as u32,
4954            reps: initial_reps,
4955            ..HcOptimalNode::default()
4956        };
4957        node_prices[0] = node0_price;
4958        let sufficient_len = profile.sufficient_match_len;
4959        let ll0_price = BtMatcher::cached_lit_length_price(
4960            profile,
4961            $stats,
4962            0,
4963            &mut ll_cache,
4964            ll_price_stamp,
4965        );
4966        let ll1_price = BtMatcher::cached_lit_length_price(
4967            profile,
4968            $stats,
4969            1,
4970            &mut ll_cache,
4971            ll_price_stamp,
4972        );
4973        let mut pos = 1usize;
4974        let mut last_pos = 0usize;
4975        let mut forced_end: Option<usize> = None;
4976        let mut forced_end_state: Option<HcOptimalNode> = None;
4977        // Price companion of `forced_end_state` (price no longer lives in the
4978        // node struct; tracked alongside the forced-seed node).
4979        let mut forced_end_price: Option<u32> = None;
4980        let mut seed_forced_shortest_path = false;
4981        let mut opt_ldm = HcOptLdmState {
4982            seq_store: HcRawSeqStore {
4983                pos: 0,
4984                pos_in_sequence: 0,
4985                size: $self.backend.bt_mut().ldm_sequences.len(),
4986            },
4987            ..HcOptLdmState::default()
4988        };
4989        let has_ldm = !$self.backend.bt_mut().ldm_sequences.is_empty();
4990        if has_ldm {
4991            // `ldm_sequences` are emitted in BLOCK-relative coordinates,
4992            // but this optimal-parser pass runs over a SEGMENT of the
            // block starting at block-offset `ldm_block_offset` and uses
4994            // segment-relative positions throughout. Fast-forward the raw
4995            // seq-store cursor past the bytes covered by earlier segments
4996            // so the (segment-relative) LDM windows below land at the
4997            // correct positions. Idempotent: `ldm_skip_raw_seq_store_bytes`
4998            // recomputes from `pos = 0`, so re-running it per segment is
4999            // safe. Without this, every segment after the first re-applied
5000            // the block's leading LDM windows at the wrong offset, emitting
5001            // matches that copy the wrong bytes (undecodable frame).
5002            if ldm_block_offset > 0 {
5003                $self
5004                    .backend
5005                    .bt_mut()
5006                    .ldm_skip_raw_seq_store_bytes(&mut opt_ldm.seq_store, ldm_block_offset);
5007            }
5008            $self
5009                .backend
5010                .bt_mut()
5011                .ldm_get_next_match_and_update_seq_store(&mut opt_ldm, 0, $current_len);
5012        }
5013
5014        // Upstream zstd-like seed at rPos=0: initialize frontier with matches starting
5015        // at current position before entering the generic forward DP loop.
5016        if $current_len >= min_match_len {
5017            let seed_ldm = if has_ldm {
5018                $self.backend.bt_mut().ldm_process_match_candidate(
5019                    &mut opt_ldm,
5020                    0,
5021                    $current_len,
5022                    min_match_len,
5023                )
5024            } else {
5025                None
5026            };
5027            candidates.clear();
5028            // SAFETY: wrapper is in the same target_feature umbrella as the
5029            // `$collect` kernel variant; the runtime kernel detector already
5030            // gated entry into the wrapper.
5031            unsafe {
5032                $self.$collect::<$strategy_ty, true>(
5033                    $current_abs_start,
5034                    current_abs_end,
5035                    profile,
5036                    HcCandidateQuery {
5037                        reps: initial_reps,
5038                        lit_len: initial_litlen,
5039                        ldm_candidate: seed_ldm,
5040                    },
5041                    &mut candidates,
5042                )
5043            };
5044            if !candidates.is_empty() {
5045                // `min_match_len >= HC_FORMAT_MINMATCH (3)` by invariant.
5046                last_pos = (min_match_len - 1).min(frontier_limit);
5047                for p in 1..min_match_len.min(frontier_buffer_size) {
5048                    BtMatcher::reset_opt_node(&mut nodes[p]);
5049                    // Reset the price (sole home; the node carries none).
5050                    node_prices[p] = u32::MAX;
5051                    // `initial_litlen` is the litlen carried from prior
5052                    // optimal-plan segments — its real bound is the
5053                    // current block length (the frame compressor caps
5054                    // block scan at `HC_BLOCKSIZE_MAX`), not the segment
5055                    // `current_len`. `p < min_match_len` (small constant),
5056                    // so the sum stays well within `u32::MAX`. Use
5057                    // `checked_add` FIRST so the `usize` addition itself
5058                    // cannot overflow on i686 (where `usize` is 32-bit
5059                    // and a wrapping `+` would slip past `try_from`).
5060                    let seed_litlen = initial_litlen
5061                        .checked_add(p)
5062                        .and_then(|s| u32::try_from(s).ok())
5063                        .expect("optimal parser seed litlen out of u32 range");
5064                    nodes[p].litlen = seed_litlen;
5065                }
5066            }
5067
5068            if let Some(candidate) = candidates.last() {
5069                let longest_len = candidate.match_len.min($current_len);
5070                if longest_len > sufficient_len {
5071                    let off_base = BtMatcher::encode_offset_base_with_reps(
5072                        candidate.offset as u32,
5073                        initial_litlen,
5074                        initial_reps,
5075                    );
5076                    let off_price = profile
5077                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5078                    let ml_price = BtMatcher::cached_match_length_price(
5079                        profile,
5080                        $stats,
5081                        longest_len,
5082                        &mut ml_cache,
5083                        ml_price_stamp,
5084                    );
5085                    let seq_cost = BtMatcher::add_prices(
5086                        ll0_price,
5087                        profile.match_price_from_parts(off_price, ml_price, $stats),
5088                    );
5089                    let forced_price = BtMatcher::add_prices(node_prices[0], seq_cost);
5090                    let forced_state = HcOptimalNode {
5091                        off: candidate.offset as u32,
5092                        mlen: longest_len as u32,
5093                        litlen: 0,
5094                        reps: initial_reps,
5095                    };
5096                    if longest_len < frontier_buffer_size && forced_price < node_prices[longest_len] {
5097                        nodes[longest_len] = forced_state;
5098                        node_prices[longest_len] = forced_price;
5099                    }
5100                    forced_end = Some(longest_len);
5101                    forced_end_state = Some(forced_state);
5102                    forced_end_price = Some(forced_price);
5103                    seed_forced_shortest_path = true;
5104                }
5105            }
5106            if !seed_forced_shortest_path {
5107                let mut prev_max_len = min_match_len - 1;
5108                for candidate in candidates.iter() {
5109                    let max_match_len = candidate.match_len.min(frontier_limit);
5110                    if max_match_len < min_match_len {
5111                        continue;
5112                    }
5113                    let start_len = (prev_max_len + 1).max(min_match_len);
5114                    if start_len > max_match_len {
5115                        prev_max_len = prev_max_len.max(max_match_len);
5116                        continue;
5117                    }
5118                    if max_match_len > last_pos {
5119                        BtMatcher::reset_opt_nodes(
5120                            &mut nodes,
5121                            &mut node_prices,
5122                            last_pos + 1,
5123                            max_match_len,
5124                        );
5125                    }
5126                    let off_base = BtMatcher::encode_offset_base_with_reps(
5127                        candidate.offset as u32,
5128                        initial_litlen,
5129                        initial_reps,
5130                    );
5131                    let off_price = profile
5132                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5133                    debug_assert!(max_match_len < frontier_buffer_size);
5134                    let nodes0_price = node_prices[0];
5135                    for match_len in (start_len..=max_match_len).rev() {
5136                        let ml_price = BtMatcher::cached_match_length_price(
5137                            profile,
5138                            $stats,
5139                            match_len,
5140                            &mut ml_cache,
5141                            ml_price_stamp,
5142                        );
5143                        let seq_cost = BtMatcher::add_prices(
5144                            ll0_price,
5145                            profile.match_price_from_parts(off_price, ml_price, $stats),
5146                        );
5147                        let next_cost = BtMatcher::add_prices(nodes0_price, seq_cost);
5148                        let node_price = unsafe { *node_prices.get_unchecked(match_len) };
5149                        if match_len > last_pos || next_cost < node_price {
5150                            let slot = unsafe { nodes.get_unchecked_mut(match_len) };
5151                            *slot = HcOptimalNode {
5152                                off: candidate.offset as u32,
5153                                mlen: match_len as u32,
5154                                litlen: 0,
5155                                reps: initial_reps,
5156                            };
5157                            unsafe { *node_prices.get_unchecked_mut(match_len) = next_cost };
5158                            if match_len > last_pos {
5159                                last_pos = match_len;
5160                            }
5161                        } else if abort_on_worse_match {
5162                            break;
5163                        }
5164                    }
5165                    prev_max_len = prev_max_len.max(max_match_len);
5166                }
5167                if last_pos + 1 < frontier_buffer_size {
5168                    node_prices[last_pos + 1] = u32::MAX;
5169                }
5170            }
5171        }
5172        while !seed_forced_shortest_path && pos <= last_pos && pos <= frontier_limit {
5173            debug_assert!(pos + 1 < frontier_buffer_size);
5174            let prev_node = unsafe { *nodes.get_unchecked(pos - 1) };
5175            let prev_node_price = unsafe { *node_prices.get_unchecked(pos - 1) };
5176            if prev_node_price != u32::MAX {
5177                let lit_len = prev_node.litlen as usize + 1;
5178                let lit_price = {
5179                    let bt = $self.backend.bt_mut();
5180                    BtMatcher::cached_literal_price(
5181                        profile,
5182                        $stats,
5183                        $current[pos - 1],
5184                        &mut bt.opt_lit_price_scratch,
5185                        &mut bt.opt_lit_price_generation,
5186                        lit_price_stamp,
5187                    )
5188                };
5189                let ll_delta = BtMatcher::cached_lit_length_delta_price(
5190                    profile,
5191                    $stats,
5192                    lit_len,
5193                    &mut ll_cache,
5194                    ll_price_stamp,
5195                );
5196                let lit_cost = BtMatcher::add_price_delta(prev_node_price, lit_price, ll_delta);
5197                // `node_pos_price` is the OLD price at `pos` (before the write
5198                // below) — also the price of `prev_match`, the pre-overwrite copy.
5199                let node_pos_price = unsafe { *node_prices.get_unchecked(pos) };
5200                if lit_cost <= node_pos_price {
5201                    let prev_match = unsafe { *nodes.get_unchecked(pos) };
5202                    let slot = unsafe { nodes.get_unchecked_mut(pos) };
5203                    *slot = prev_node;
5204                    slot.litlen = lit_len as u32;
5205                    node_prices[pos] = lit_cost;
5206                    #[allow(clippy::collapsible_if)]
5207                    if opt_level
5208                        && prev_match.mlen > 0
5209                        && prev_match.litlen == 0
5210                        && pos < $current_len
5211                    {
5212                        if ll1_price < ll0_price {
5213                            let next_lit_price = {
5214                                let bt = $self.backend.bt_mut();
5215                                BtMatcher::cached_literal_price(
5216                                    profile,
5217                                    $stats,
5218                                    $current[pos],
5219                                    &mut bt.opt_lit_price_scratch,
5220                                    &mut bt.opt_lit_price_generation,
5221                                    lit_price_stamp,
5222                                )
5223                            };
5224                            let with1literal = BtMatcher::add_price_delta(
5225                                node_pos_price,
5226                                next_lit_price,
5227                                ll1_price as i32 - ll0_price as i32,
5228                            );
5229                            let ll_delta_next = BtMatcher::cached_lit_length_delta_price(
5230                                profile,
5231                                $stats,
5232                                lit_len + 1,
5233                                &mut ll_cache,
5234                                ll_price_stamp,
5235                            );
5236                            let with_more_literals =
5237                                BtMatcher::add_price_delta(lit_cost, next_lit_price, ll_delta_next);
5238                            let next = pos + 1;
5239                            let next_price = unsafe { *node_prices.get_unchecked(next) };
5240                            if with1literal < with_more_literals && with1literal < next_price {
5241                                // Upstream zstd parity (zstd_opt.c:1232): `cur >= prevMatch.mlen`.
5242                                debug_assert!(pos >= prev_match.mlen as usize);
5243                                let prev_pos = pos - prev_match.mlen as usize;
5244                                {
5245                                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5246                                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5247                                        prev_match.off,
5248                                        prev_state.litlen as usize,
5249                                        prev_state.reps,
5250                                    );
5251                                    let slot = unsafe { nodes.get_unchecked_mut(next) };
5252                                    *slot = prev_match;
5253                                    slot.reps = reps_after_match;
5254                                    slot.litlen = 1;
5255                                    node_prices[next] = with1literal;
5256                                    if next > last_pos {
5257                                        last_pos = next;
5258                                    }
5259                                }
5260                            }
5261                        }
5262                    }
5263                }
5264            }
5265
5266            // Memory-resident DP (upstream zstd parity): read opt[cur] fields on
5267            // demand instead of holding a 28-byte node copy live across the
5268            // per-position `$collect` call below. The held copy forced LLVM
5269            // to spill reps[3] + litlen around the (non-inlinable) call;
5270            // reading the fields fresh on each side keeps them out of the
5271            // cross-call live set. `nodes[pos]` is stable across `$collect`
5272            // (it only fills `candidates`), so post-call reads are identical.
5273            let base_cost = unsafe { *node_prices.get_unchecked(pos) };
5274            if base_cost == u32::MAX {
5275                pos += 1;
5276                continue;
5277            }
5278            {
5279                let base_node = unsafe { *nodes.get_unchecked(pos) };
5280                if base_node.mlen > 0 && base_node.litlen == 0 {
5281                    // Upstream zstd parity (zstd_opt.c:1255): `cur >= opt[cur].mlen`.
5282                    debug_assert!(pos >= base_node.mlen as usize);
5283                    let prev_pos = pos - base_node.mlen as usize;
5284                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5285                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5286                        base_node.off,
5287                        prev_state.litlen as usize,
5288                        prev_state.reps,
5289                    );
5290                    unsafe { nodes.get_unchecked_mut(pos).reps = reps_after_match };
5291                }
5292            }
5293
5294            if pos + 8 > $current_len {
5295                pos += 1;
5296                continue;
5297            }
5298
5299            if pos == last_pos {
5300                break;
5301            }
5302
5303            let next_price = unsafe { *node_prices.get_unchecked(pos + 1) };
5304            // `saturating_add` is REQUIRED here, not a masked bug: `base_cost`
5305            // is a node price that can be the `u32::MAX` "unreachable" sentinel,
5306            // and saturating keeps `base_cost + margin` pinned at MAX so the
5307            // comparison stays correct. Plain `+` would wrap the sentinel and
5308            // flip the abort decision (a ratio bug / debug overflow panic).
5309            if abort_on_worse_match
5310                && next_price <= base_cost.saturating_add(HC_BITCOST_MULTIPLIER / 2)
5311            {
5312                pos += 1;
5313                continue;
5314            }
5315
5316            let abs_pos = $current_abs_start + pos;
5317            let ldm_candidate = if has_ldm {
5318                $self.backend.bt_mut().ldm_process_match_candidate(
5319                    &mut opt_ldm,
5320                    pos,
5321                    $current_len - pos,
5322                    min_match_len,
5323                )
5324            } else {
5325                None
5326            };
5327            candidates.clear();
5328            // SAFETY: same umbrella as `$collect`. Query fields are read
5329            // fresh here (consumed into the call's argument) so they do not
5330            // stay live across the call; the post-call reads below are a
5331            // separate, fresh load of the same stable `nodes[pos]`.
5332            unsafe {
5333                $self.$collect::<$strategy_ty, true>(
5334                    abs_pos,
5335                    current_abs_end,
5336                    profile,
5337                    HcCandidateQuery {
5338                        reps: nodes.get_unchecked(pos).reps,
5339                        lit_len: nodes.get_unchecked(pos).litlen as usize,
5340                        ldm_candidate,
5341                    },
5342                    &mut candidates,
5343                )
5344            };
5345            // Post-call reads of opt[cur]: fresh, born after `$collect`, so
5346            // never part of the cross-call live set (see memory-resident note
5347            // above). `nodes[pos]` is untouched by `$collect`.
5348            let base_reps = unsafe { nodes.get_unchecked(pos).reps };
5349            let base_litlen = unsafe { nodes.get_unchecked(pos).litlen as usize };
5350            if let Some(candidate) = candidates.last() {
5351                let longest_len = candidate.match_len.min($current_len - pos);
5352                if longest_len > sufficient_len
5353                    || pos + longest_len >= HC_OPT_NUM
5354                    || pos + longest_len >= $current_len
5355                {
5356                    let lit_len = base_litlen;
5357                    let off_base = BtMatcher::encode_offset_base_with_reps(
5358                        candidate.offset as u32,
5359                        lit_len,
5360                        base_reps,
5361                    );
5362                    let off_price = profile
5363                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5364                    let ml_price = BtMatcher::cached_match_length_price(
5365                        profile,
5366                        $stats,
5367                        longest_len,
5368                        &mut ml_cache,
5369                        ml_price_stamp,
5370                    );
5371                    let seq_cost = BtMatcher::add_prices(
5372                        ll0_price,
5373                        profile.match_price_from_parts(off_price, ml_price, $stats),
5374                    );
5375                    let forced_price = BtMatcher::add_prices(base_cost, seq_cost);
5376                    let end_pos = (pos + longest_len).min($current_len);
5377                    forced_end = Some(end_pos);
5378                    forced_end_state = Some(HcOptimalNode {
5379                        off: candidate.offset as u32,
5380                        mlen: longest_len as u32,
5381                        litlen: 0,
5382                        reps: base_reps,
5383                    });
5384                    forced_end_price = Some(forced_price);
5385                    break;
5386                }
5387            }
5388            let mut prev_max_len = min_match_len - 1;
5389            for candidate in candidates.iter() {
5390                // Outer loop guards `pos <= frontier_limit` (see the
5391                // `while ... pos <= frontier_limit` condition); the
5392                // subtraction below is therefore safe.
5393                debug_assert!(pos <= frontier_limit);
5394                let max_match_len = candidate
5395                    .match_len
5396                    .min($current_len - pos)
5397                    .min(frontier_limit - pos);
5398                let min_len = min_match_len;
5399                if max_match_len < min_len {
5400                    continue;
5401                }
5402                let start_len = (prev_max_len + 1).max(min_len);
5403                if start_len > max_match_len {
5404                    prev_max_len = prev_max_len.max(max_match_len);
5405                    continue;
5406                }
5407                let max_next = pos + max_match_len;
5408                if max_next > last_pos {
5409                    BtMatcher::reset_opt_nodes(
5410                        &mut nodes,
5411                        &mut node_prices,
5412                        last_pos + 1,
5413                        max_next,
5414                    );
5415                }
5416                let lit_len = base_litlen;
5417                let off_base = BtMatcher::encode_offset_base_with_reps(
5418                    candidate.offset as u32,
5419                    lit_len,
5420                    base_reps,
5421                );
5422                let off_price = profile
5423                    .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5424                debug_assert!(pos + max_match_len < frontier_buffer_size);
5425                if abort_on_worse_match {
5426                    // btopt (OPT_LEVEL == 0): reverse-iterate with early break —
5427                    // once a longer match stops improving, shorter ones are
5428                    // skipped. Order-dependent, stays scalar.
5429                    for match_len in (start_len..=max_match_len).rev() {
5430                        let next = pos + match_len;
5431                        let ml_price = BtMatcher::cached_match_length_price(
5432                            profile,
5433                            $stats,
5434                            match_len,
5435                            &mut ml_cache,
5436                            ml_price_stamp,
5437                        );
5438                        let seq_cost = BtMatcher::add_prices(
5439                            ll0_price,
5440                            profile.match_price_from_parts(off_price, ml_price, $stats),
5441                        );
5442                        let next_cost = BtMatcher::add_prices(base_cost, seq_cost);
5443                        let node_next_price = unsafe { *node_prices.get_unchecked(next) };
5444                        if next > last_pos || next_cost < node_next_price {
5445                            let slot = unsafe { nodes.get_unchecked_mut(next) };
5446                            *slot = HcOptimalNode {
5447                                off: candidate.offset as u32,
5448                                mlen: match_len as u32,
5449                                litlen: 0,
5450                                reps: base_reps,
5451                            };
5452                            unsafe { *node_prices.get_unchecked_mut(next) = next_cost };
5453                            if next > last_pos {
5454                                last_pos = next;
5455                            }
5456                        } else {
5457                            break;
5458                        }
5459                    }
5460                } else {
5461                    // btultra / btultra2 (OPT_LEVEL >= 2): no abort, each
5462                    // match_len writes a distinct node => order-independent.
5463                    // Dispatch to the per-tier price-set ($priceset is the
5464                    // tier's fn: AVX2 SoA-vector compare for the avx2 wrapper,
5465                    // inline scalar otherwise) — it folds into this wrapper's
5466                    // monomorphisation, so no call ABI / runtime feature check.
5467                    #[allow(unused_unsafe)]
5468                    {
5469                        last_pos = last_pos.max(unsafe {
5470                            $priceset(
5471                                &mut node_prices,
5472                                &mut nodes,
5473                                ml_cache,
5474                                ml_price_stamp,
5475                                profile,
5476                                $stats,
5477                                pos,
5478                                start_len,
5479                                max_match_len,
5480                                ll0_price,
5481                                off_price,
5482                                base_cost,
5483                                candidate.offset as u32,
5484                                base_reps,
5485                                last_pos,
5486                            )
5487                        });
5488                    }
5489                }
5490                prev_max_len = prev_max_len.max(max_match_len);
5491            }
5492
5493            if last_pos + 1 < frontier_buffer_size {
5494                unsafe {
5495                    *node_prices.get_unchecked_mut(last_pos + 1) = u32::MAX;
5496                }
5497            }
5498            pos += 1;
5499        }
5500
5501        if last_pos == 0 {
5502            if $current_len == 0 {
5503                let price = node_prices[0];
5504                return $self.backend.bt_mut().finish_optimal_plan(
5505                    HcOptimalPlanBuffers {
5506                        nodes,
5507                        node_prices,
5508                        candidates,
5509                        store,
5510                        price_arena,
5511                    },
5512                    (price, initial_reps, initial_litlen, 0),
5513                );
5514            }
5515            let lit_price = {
5516                let bt = $self.backend.bt_mut();
5517                BtMatcher::cached_literal_price(
5518                    profile,
5519                    $stats,
5520                    $current[0],
5521                    &mut bt.opt_lit_price_scratch,
5522                    &mut bt.opt_lit_price_generation,
5523                    lit_price_stamp,
5524                )
5525            };
5526            // `initial_litlen` is carried across optimal-plan segments;
5527            // its real bound is the current block length, not
5528            // `current_len`. On i686 (32-bit `usize`) `+ 1` could
5529            // theoretically wrap if the invariant ever broke. Catch
5530            // that explicitly via `checked_add` rather than letting a
5531            // wrapping sum slip into the price lookup.
5532            let next_litlen = initial_litlen
5533                .checked_add(1)
5534                .expect("optimal parser next litlen out of usize range");
5535            let ll_delta = BtMatcher::cached_lit_length_delta_price(
5536                profile,
5537                $stats,
5538                next_litlen,
5539                &mut ll_cache,
5540                ll_price_stamp,
5541            );
5542            let price = BtMatcher::add_price_delta(node_prices[0], lit_price, ll_delta);
5543            return $self.backend.bt_mut().finish_optimal_plan(
5544                HcOptimalPlanBuffers {
5545                    nodes,
5546                    node_prices,
5547                    candidates,
5548                    store,
5549                    price_arena,
5550                },
5551                (price, initial_reps, next_litlen, 1),
5552            );
5553        }
5554
5555        let target_pos = forced_end.unwrap_or(last_pos.min(frontier_limit));
5556        // Price lives in `node_prices`, not the node struct, so carry the
5557        // final-stretch price alongside its node (forced-seed companion or the
5558        // frontier price at `target_pos`).
5559        let (last_stretch, last_stretch_price) = if let Some(forced_state) = forced_end_state {
5560            (forced_state, forced_end_price.expect("forced state has a price"))
5561        } else {
5562            (nodes[target_pos], node_prices[target_pos])
5563        };
5564        if last_stretch_price == u32::MAX {
5565            return $self.backend.bt_mut().finish_optimal_plan(
5566                HcOptimalPlanBuffers {
5567                    nodes,
5568                    node_prices,
5569                    candidates,
5570                    store,
5571                    price_arena,
5572                },
5573                (u32::MAX, initial_reps, initial_litlen, $current_len),
5574            );
5575        }
5576
5577        if last_stretch.mlen == 0 {
5578            return $self.backend.bt_mut().finish_optimal_plan(
5579                HcOptimalPlanBuffers {
5580                    nodes,
5581                    node_prices,
5582                    candidates,
5583                    store,
5584                    price_arena,
5585                },
5586                (
5587                    last_stretch_price,
5588                    last_stretch.reps,
5589                    last_stretch.litlen as usize,
5590                    target_pos.min($current_len),
5591                ),
5592            );
5593        }
5594
5595        let mut cur = target_pos.saturating_sub(last_stretch.mlen as usize);
5596        let end_reps = if last_stretch.litlen == 0 {
5597            let prev_state = nodes[cur];
5598            let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5599                last_stretch.off,
5600                prev_state.litlen as usize,
5601                prev_state.reps,
5602            );
5603            reps_after_match
5604        } else {
5605            let tail_literals = last_stretch.litlen as usize;
5606            if cur < tail_literals {
5607                return $self.backend.bt_mut().finish_optimal_plan(
5608                    HcOptimalPlanBuffers {
5609                        nodes,
5610                        node_prices,
5611                        candidates,
5612                        store,
5613                        price_arena,
5614                    },
5615                    (
5616                        last_stretch_price,
5617                        last_stretch.reps,
5618                        tail_literals,
5619                        target_pos.min($current_len),
5620                    ),
5621                );
5622            }
5623            cur -= tail_literals;
5624            last_stretch.reps
5625        };
5626        let store_end = cur + 2;
5627        if store.len() <= store_end {
5628            store.resize(store_end + 1, HcOptimalNode::default());
5629        }
5630        let mut store_start;
5631        let mut stretch_pos = cur;
5632
5633        if last_stretch.litlen > 0 {
5634            store[store_end] = HcOptimalNode {
5635                litlen: last_stretch.litlen,
5636                mlen: 0,
5637                ..HcOptimalNode::default()
5638            };
5639            store_start = store_end.saturating_sub(1);
5640            store[store_start] = last_stretch;
5641        }
5642        store[store_end] = last_stretch;
5643        store_start = store_end;
5644
5645        loop {
5646            let next_stretch = nodes[stretch_pos];
5647            store[store_start].litlen = next_stretch.litlen;
5648            if next_stretch.mlen == 0 {
5649                break;
5650            }
5651            if store_start == 0 {
5652                break;
5653            }
5654            store_start -= 1;
5655            store[store_start] = next_stretch;
5656            // Parser invariant: every emitted stretch is bounded by the
5657            // current block, so `litlen + mlen <= current_len <=
5658            // HC_BLOCKSIZE_MAX (128 KiB)`. The `as usize` widening + raw
5659            // `+` is safe on 32-bit targets — two u32 values do NOT
5660            // automatically fit in `usize` on i686, the block bound is
5661            // what makes this addition safe.
5662            let litlen = next_stretch.litlen as usize;
5663            let mlen = next_stretch.mlen as usize;
5664            debug_assert!(litlen + mlen <= $current_len);
5665            let step = litlen + mlen;
5666            if step == 0 || stretch_pos < step {
5667                break;
5668            }
5669            stretch_pos -= step;
5670        }
5671
5672        let mut tail_literals = initial_litlen;
5673        let mut store_pos = store_start;
5674        while store_pos <= store_end {
5675            let stretch = store[store_pos];
5676            let llen = stretch.litlen as usize;
5677            let mlen = stretch.mlen as usize;
5678            if mlen == 0 {
5679                tail_literals = llen;
5680                store_pos += 1;
5681                continue;
5682            }
5683            $out.push(HcOptimalSequence {
5684                offset: stretch.off,
5685                match_len: mlen as u32,
5686                lit_len: llen as u32,
5687            });
5688            tail_literals = 0;
5689            store_pos += 1;
5690        }
5691        let result = (
5692            last_stretch_price,
5693            end_reps,
5694            if last_stretch.litlen > 0 {
5695                last_stretch.litlen as usize
5696            } else {
5697                tail_literals
5698            },
5699            target_pos.min($current_len),
5700        );
5701        $self.backend.bt_mut().finish_optimal_plan(
5702            HcOptimalPlanBuffers {
5703                nodes,
5704                node_prices,
5705                candidates,
5706                store,
5707                price_arena,
5708            },
5709            result,
5710        )
5711    }};
5712}
5713
/// Body of `collect_optimal_candidates_initialized`, stamped out once per
/// CPU kernel.
///
/// The expansion gathers every match candidate for `$abs_pos` into `$out`,
/// in this order: repcode probes (`$for_each_rep`), the optional hash3
/// short-match probe (`$hash3`), then either the binary-tree finder
/// (`$bt_update` + `$bt_insert`) or the hash-chain walk that measures
/// matches with `$cpl` (the kernel's `common_prefix_len_ptr`), and finally
/// the caller-supplied LDM candidate from `$query`.
///
/// All kernel-specific helpers are passed in by name so that each wrapper
/// can route them to variants compiled under its own `target_feature`
/// umbrella, letting the whole per-position pipeline inline without ABI
/// barriers.
///
/// The expansion contains bare `return;` statements, so it must be used as
/// the body of a function returning `()`.
macro_rules! collect_optimal_candidates_initialized_body {
    (
        $self:expr,
        $strategy_ty:ty,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $query:ident,
        $out:ident,
        $bt_matchfinder:ident,
        $bt_update:ident,
        $bt_insert:ident,
        $for_each_rep:ident,
        $hash3:ident,
        $cpl:path $(,)?
    ) => {{
        // Associated const of the strategy type: when it is `false` the
        // hash3 probe below is dead code for that monomorphisation. Only
        // one direction is asserted — a strategy that asks for hash3 must
        // have a configured table; a configured table without a strategy
        // using it is tolerated.
        let hash3_enabled: bool = <$strategy_ty as super::strategy::Strategy>::USE_HASH3;
        debug_assert!(!$self.table.hash_table.is_empty());
        debug_assert!($self.table.hash3_log == 0 || !$self.table.hash3_table.is_empty());
        debug_assert!(
            !hash3_enabled || $self.table.hash3_log != 0,
            "Strategy::USE_HASH3 = true but runtime hash3_log is 0 — call configure() first",
        );
        debug_assert!(!$self.table.chain_table.is_empty());

        let min_len = HC_OPT_MIN_MATCH_LEN;
        let rep_history = $query.reps;
        let pending_literals = $query.lit_len;
        let ldm_match = $query.ldm_candidate;
        $out.clear();

        // Positions below the skip watermark are not searched; only a
        // pending LDM candidate is forwarded.
        if $abs_pos < $self.table.skip_insert_until_abs {
            if let Some(ldm) = ldm_match {
                let mut ladder_best = 0usize;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_best,
                    ldm,
                    min_len,
                );
            }
            return;
        }

        if $bt_matchfinder {
            // SAFETY: the enclosing wrapper shares `$bt_update`'s
            // target_feature umbrella, and entry was gated by the runtime
            // kernel detector.
            unsafe { $self.table.$bt_update($abs_pos, $current_abs_end) };
        }

        // Fewer than four bytes of live history remain at this position:
        // again only the LDM candidate (if any) is forwarded.
        let cur_idx = $abs_pos - $self.table.history_abs_start;
        if cur_idx + 4 > $self.table.live_history().len() {
            if let Some(ldm) = ldm_match {
                let mut ladder_best = 0usize;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_best,
                    ldm,
                    min_len,
                );
            }
            return;
        }

        let mut ladder_best = 0usize;
        let mut stop_search = false;
        let mut rep_hit = false;
        // SAFETY: same umbrella as above; the closure is monomorphised into
        // this call.
        unsafe {
            $self.hc.$for_each_rep(
                &$self.table,
                $abs_pos,
                pending_literals,
                rep_history,
                $current_abs_end,
                min_len,
                |rep| {
                    let rep_len = rep.match_len;
                    rep_hit |= rep_len >= min_len;
                    let _ = super::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        &mut ladder_best,
                        rep,
                        min_len,
                    );
                    stop_search |= rep_len > $profile.sufficient_match_len;
                    // The rep helper caps `rep.match_len` at
                    // `current_abs_end - abs_pos`, so this sum never
                    // exceeds `current_abs_end` and cannot overflow.
                    stop_search |= $abs_pos + rep_len >= $current_abs_end;
                },
            )
        };

        // Short-match probe: only for strategies that enable hash3, and only
        // while nothing of at least `min_len` has been found yet.
        if hash3_enabled && !stop_search && ladder_best < min_len {
            $self.table.update_hash3_until($abs_pos);
            // SAFETY: same umbrella for the hash3 probe.
            let h3_probe = unsafe {
                $self
                    .table
                    .$hash3($abs_pos, $current_abs_end, min_len)
            };
            if let Some(h3) = h3_probe {
                let h3_len = h3.match_len;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_best,
                    h3,
                    min_len,
                );
                if !rep_hit {
                    if h3_len > $profile.sufficient_match_len
                        || $abs_pos + h3_len >= $current_abs_end
                    {
                        $self.table.skip_insert_until_abs = $abs_pos + 1;
                        stop_search = true;
                    }
                }
            }
        }

        if !stop_search {
            if $bt_matchfinder {
                // SAFETY: same umbrella for the BT insert-and-collect step.
                unsafe {
                    $self.table.$bt_insert(
                        $abs_pos,
                        $current_abs_end,
                        $profile,
                        min_len,
                        &mut ladder_best,
                        $out,
                    )
                };
            } else {
                // Hash-chain fallback.
                $self.table.insert_position($abs_pos);
                let depth_cap = $profile.max_chain_depth.min($self.hc.search_depth);
                let history = $self.table.live_history();
                // Raw `+ 9` is safe — see `bt_insert_step_no_rebase_body!`
                // for the discussion of the `STREAM_ABS_HEADROOM` cap in
                // `MatchTable::add_data`.
                let mut furthest_end_abs = $abs_pos + 9;
                if depth_cap > 0 {
                    let chain = $self.hc.chain_candidates(&$self.table, $abs_pos);
                    for (visited, cand_abs) in chain.into_iter().enumerate() {
                        // Stop at the depth budget or at the `usize::MAX`
                        // entry, whichever comes first.
                        if visited >= depth_cap || cand_abs == usize::MAX {
                            break;
                        }
                        let window_low = $self.table.window_low_abs_for_target($abs_pos);
                        if cand_abs < window_low || cand_abs >= $abs_pos {
                            continue;
                        }
                        let cand_idx = cand_abs - $self.table.history_abs_start;
                        debug_assert!(
                            $abs_pos <= $current_abs_end,
                            "HC chain walker called past current block end"
                        );
                        let tail_limit = $current_abs_end - $abs_pos;
                        let history_ptr = history.as_ptr();
                        // SAFETY: both indices are history-relative and
                        // `tail_limit` bounds the scan within `history`.
                        // `$cpl` is the kernel-specific
                        // `common_prefix_len_ptr`; it inlines because the
                        // surrounding wrapper carries the same
                        // target_feature.
                        let found_len = unsafe {
                            $cpl(
                                history_ptr.add(cand_idx),
                                history_ptr.add(cur_idx),
                                tail_limit,
                            )
                        };
                        if found_len < min_len {
                            continue;
                        }
                        let accepted = super::bt::BtMatcher::push_candidate_ladder(
                            $out,
                            &mut ladder_best,
                            MatchCandidate {
                                start: $abs_pos,
                                offset: $abs_pos - cand_abs,
                                match_len: found_len,
                            },
                            min_len,
                        );
                        if accepted {
                            furthest_end_abs = furthest_end_abs.max(cand_abs + found_len);
                        }
                        if found_len > HC_OPT_NUM || $abs_pos + found_len >= $current_abs_end {
                            break;
                        }
                    }
                }
                // `furthest_end_abs` starts at `abs_pos + 9` and only ever
                // grows, so the subtraction below cannot underflow.
                let resume_at = furthest_end_abs - 8;
                if resume_at > $self.table.skip_insert_until_abs {
                    $self.table.skip_insert_until_abs = resume_at;
                }
            }
        }

        // The LDM candidate is offered last, after all other finders.
        if let Some(ldm) = ldm_match {
            let _ = super::bt::BtMatcher::push_candidate_ladder(
                $out,
                &mut ladder_best,
                ldm,
                min_len,
            );
        }
    }};
}
5942
/// Body of `hash3_candidate`, parameterized over the per-CPU
/// `common_prefix_len_ptr` symbol (`$cpl`) so the prefix scan inlines under
/// the wrapper's `target_feature` umbrella.
///
/// Looks up the single hash3-table entry for the three bytes at `$abs_pos`
/// and evaluates to `Some(MatchCandidate)` when that entry points at an
/// earlier in-history position within `HC3_MAX_OFFSET` whose common prefix
/// is at least `$min_match_len` long; otherwise `None`. The expansion uses
/// `return None`, so it must be the body of a function returning
/// `Option<MatchCandidate>`. Crate-private
/// (see `bt_insert_step_no_rebase_body!`).
macro_rules! hash3_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $cpl:path $(,)?
    ) => {{
        // A zero log means the hash3 table is not configured.
        if $table.hash3_log == 0 {
            return None;
        }
        let cur_idx = match $abs_pos.checked_sub($table.history_abs_start) {
            Some(rel) => rel,
            None => return None,
        };
        let history = $table.live_history();
        if cur_idx + 4 > history.len() {
            return None;
        }
        let bucket = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            history,
            cur_idx,
            $table.hash3_log,
            3,
        );
        // An out-of-range bucket is treated like an empty slot.
        let stored = match $table.hash3_table.get(bucket) {
            Some(&slot) => slot,
            None => $crate::encoding::match_table::storage::HC_EMPTY,
        };
        let cand_abs =
            match $crate::encoding::match_table::storage::MatchTable::stored_abs_position_fast(
                stored,
                $table.position_base,
                $table.index_shift,
            ) {
                Some(abs) => abs,
                None => return None,
            };
        // The candidate must lie inside live history and strictly before
        // the current position.
        if !($table.history_abs_start <= cand_abs && cand_abs < $abs_pos) {
            return None;
        }
        let distance = $abs_pos - cand_abs;
        if distance >= $crate::encoding::bt::HC3_MAX_OFFSET {
            return None;
        }
        let cand_idx = cand_abs - $table.history_abs_start;
        let tail_limit = $current_abs_end.saturating_sub($abs_pos);
        let history_ptr = history.as_ptr();
        // SAFETY: `cand_idx` and `cur_idx` are within the history range;
        // `tail_limit` bounds the scan within `history`.
        let found_len =
            unsafe { $cpl(history_ptr.add(cand_idx), history_ptr.add(cur_idx), tail_limit) };
        if found_len >= $min_match_len {
            Some($crate::encoding::opt::types::MatchCandidate {
                start: $abs_pos,
                offset: distance,
                match_len: found_len,
            })
        } else {
            None
        }
    }};
}
6000pub(crate) use hash3_candidate_body;
6001
/// `for_each_repcode_candidate_with_reps` body parameterized over the per-CPU
/// `common_prefix_len_ptr` symbol so the per-rep prefix probe inlines under
/// the wrapper's `target_feature` umbrella instead of crossing the ABI
/// boundary through the dispatcher. Three rep probes per encoded position →
/// thousands per segment, so the per-call barrier was non-trivial.
///
/// The callback `f` runs in the wrapper's umbrella context too, so closures
/// that capture mutable state still work (FnMut). Crate-private
/// (see `bt_insert_step_no_rebase_body!`).
///
/// Arguments:
/// - `$table`: expression exposing `live_history()` (the searchable bytes)
///   and `history_abs_start` (absolute position of its first byte).
/// - `$abs_pos`: absolute position being probed.
/// - `$lit_len`: pending literal-run length. `0` selects the shifted rep set
///   `(rep1, rep2, rep0 - 1)`; anything else selects `(rep0, rep1, rep2)`.
/// - `$reps`: the three current repeat offsets (indexable, integer elements).
/// - `$current_abs_end`: absolute end bound; match lengths are capped at
///   `current_abs_end - abs_pos` (saturating).
/// - `$min_match_len`: shortest match length worth reporting.
/// - `$f`: callback invoked with a `MatchCandidate` for every accepted rep.
/// - `$cpl`: path to the unsafe prefix-length kernel, called as
///   `$cpl(candidate_ptr, current_ptr, max) -> usize`.
///
/// Control flow: the expansion `return`s from the enclosing function when
/// fewer than 4 bytes remain at `$abs_pos` inside the live history.
macro_rules! for_each_repcode_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $lit_len:ident,
        $reps:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $f:ident,
        $cpl:path $(,)?
    ) => {{
        let rep_offsets: [Option<usize>; 3] = if $lit_len == 0 {
            [
                Some($reps[1] as usize),
                Some($reps[2] as usize),
                // Deliberately NOT `bool::then_some`: its argument is
                // evaluated eagerly, so `reps[0] - 1` would be computed even
                // for `reps[0] == 0` and trip the overflow check in
                // debug/overflow-checked builds. The subtraction must only
                // run behind the guard.
                if $reps[0] > 1 {
                    Some(($reps[0] - 1) as usize)
                } else {
                    None
                },
            ]
        } else {
            [
                Some($reps[0] as usize),
                Some($reps[1] as usize),
                Some($reps[2] as usize),
            ]
        };
        let concat = $table.live_history();
        let current_idx = $abs_pos - $table.history_abs_start;
        // The 4-byte gate below needs 4 readable bytes at the current position.
        if current_idx + 4 > concat.len() {
            return;
        }
        let tail_limit = $current_abs_end.saturating_sub($abs_pos);
        let base = concat.as_ptr();
        let concat_len = concat.len();
        // By-value walk over the populated slots (`None` slots are skipped).
        for rep in rep_offsets.iter().copied().flatten() {
            // A zero offset is not a valid back-reference, and an offset
            // larger than `abs_pos` would point before absolute position 0.
            if rep == 0 || rep > $abs_pos {
                continue;
            }
            let candidate_pos = $abs_pos - rep;
            // Candidate fell out of the retained history.
            if candidate_pos < $table.history_abs_start {
                continue;
            }
            let candidate_idx = candidate_pos - $table.history_abs_start;
            // Upstream zstd `ZSTD_readMINMATCH` gate (zstd_opt.c:657-674): a
            // 4-byte (3-byte when min_match_len == 3) equality probe
            // before the full prefix scan. Equivalent filtering — a
            // mismatch here means `match_len < min_match_len`, which
            // the post-scan check rejects anyway — but it skips the
            // prefix-kernel call for the common no-match case (rep
            // offsets rarely hit on low-redundancy input).
            //
            // SAFETY: `current_idx + 4 <= concat_len` (early return
            // above) and `candidate_idx < current_idx` (rep >= 1), so
            // both 4-byte reads stay inside `concat`.
            let gate_matches = unsafe {
                let cand = base.add(candidate_idx).cast::<u32>().read_unaligned();
                let cur = base.add(current_idx).cast::<u32>().read_unaligned();
                if $min_match_len == 3 {
                    // Compare the low-address 3 bytes regardless of
                    // endianness: `to_le` puts the first memory byte in the
                    // low-order byte, so the mask keeps bytes 0..3.
                    (cand.to_le() & 0x00FF_FFFF) == (cur.to_le() & 0x00FF_FFFF)
                } else {
                    cand == cur
                }
            };
            if !gate_matches {
                continue;
            }
            // SAFETY: `candidate_idx ≤ current_idx < concat_len` (since
            // candidate_pos ≤ abs_pos and we early-returned on
            // `current_idx + 4 > concat_len`). `max` clamps to the shorter
            // remaining run so neither pointer overruns `concat`.
            let max = (concat_len - candidate_idx)
                .min(concat_len - current_idx)
                .min(tail_limit);
            let match_len = unsafe { $cpl(base.add(candidate_idx), base.add(current_idx), max) };
            if match_len < $min_match_len {
                continue;
            }
            $f(MatchCandidate {
                start: $abs_pos,
                offset: rep,
                match_len,
            });
        }
    }};
}
6096pub(crate) use for_each_repcode_candidate_body;
6097
/// `bt_insert_and_collect_matches` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Same shape as
/// [`bt_insert_step_no_rebase_body`] — picks up the matching kernel through
/// `$cmf` so the per-iteration vector probe inlines under the wrapper's
/// `target_feature` umbrella. Returns nothing (matches the original method).
/// Crate-private (see `bt_insert_step_no_rebase_body!`).
///
/// Arguments:
/// - `$table`: the match table expression (hash table, BT pair table, window
///   geometry, optional dms, and `skip_insert_until_abs` are all read or
///   written through it).
/// - `$search_depth`: compare budget; combined with
///   `$profile.max_chain_depth` via `min`.
/// - `$abs_pos`: absolute position being inserted and searched.
/// - `$current_abs_end`: absolute end of the current block; caps match
///   lengths at `current_abs_end - abs_pos`.
/// - `$profile`: value exposing `max_chain_depth`.
/// - `$min_match_len`: shortest reportable match length.
/// - `$best_len_for_skip`: dereferenced for the starting best length and
///   forwarded to `BtMatcher::push_candidate_ladder`.
/// - `$out`: candidate sink forwarded to `BtMatcher::push_candidate_ladder`.
/// - `$cmf`: path to the unsafe match-length kernel, called as
///   `$cmf(concat, idx, candidate_idx, tail_limit, seed_len) -> usize`.
///
/// Steps, in order: hash the current position (and prefetch its bucket),
/// swap the bucket head for the current position, walk the binary tree
/// re-linking the smaller/larger slots while reporting strictly longer
/// matches, terminate the dangling slots with `HC_EMPTY`, optionally descend
/// the dictionary tree, and finally publish `skip_insert_until_abs`.
///
/// Control flow: the expansion `return`s from the enclosing function when
/// fewer than 8 bytes remain at `$abs_pos` or when `relative_position`
/// yields `None`.
macro_rules! bt_insert_and_collect_matches_body {
    (
        $table:expr,
        $search_depth:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $min_match_len:ident,
        $best_len_for_skip:ident,
        $out:ident,
        $cmf:path $(,)?
    ) => {{
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        if idx + 8 > concat.len() {
            return;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT collect called past current block end"
        );
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults. The address is
            // formed with `wrapping_add`, which (unlike `add`) has no
            // in-bounds requirement, so this stays defined even though the
            // bounds-checked `hash_table[hash]` access only happens further
            // down.
            unsafe {
                _mm_prefetch(
                    $table.hash_table.as_ptr().wrapping_add(hash).cast(),
                    _MM_HINT_T0,
                );
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: prefetch never faults, and `wrapping_add` makes the
                // address computation itself well-defined for any index, so
                // an out-of-range index really is a harmless no-op hint.
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().wrapping_add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return;
        };
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // Hoist the BT pointer-pair table's base out of `self` once: every
        // access below is `chain_table[computed_index]` through `&mut self`,
        // which the optimizer cannot prove loop-invariant, so it reloads the
        // Vec's (ptr,len) from the struct AND bounds-checks on every tree
        // step (the upstream zstd walks a raw `U32* btable`, zstd_opt.c). The raw
        // base carries no borrow, so the `&self` helper calls in the loop
        // (`bt_pair_index_for_abs`, `window_low_abs_for_target`,
        // `relative_position`) coexist — they read other fields, never
        // `chain_table`. Indices are in bounds by the BT invariants:
        // `bt_pair_index_for_abs` returns `2*(abs & bt_mask) (+1)` ≤
        // `chain_table.len()-1`, and the slots only ever hold those values.
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        // See `bt_insert_step_no_rebase_body!`: saturating is needed for the
        // first BT walk of a fresh frame where `abs_pos < bt_mask`.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        let window_low = $table.window_low_abs_for_target($abs_pos);
        // Upstream zstd-style window bound in stored space so the BT-walk loop
        // condition rejects out-of-window / HC_EMPTY candidates WITHOUT
        // decoding them (mirrors upstream `while ... matchIndex >= matchLow`):
        // one range check on `match_stored` instead of decode-then-break,
        // dropping the wasted candidate_abs decode on every walk's terminating
        // step. candidate_abs(s) = (position_base + s - 1) - index_shift =
        // base + s (wrapping); in-window ⟺ candidate_abs - window_low <
        // abs_pos - window_low ⟺ s.wrapping_add(win_off) < win_range.
        // HC_EMPTY (s = 0) maps to base = (lowest representable abs) - 1 <
        // window_low, so it falls out of range and ends the walk.
        let win_off = $table
            .position_base
            .wrapping_sub(1)
            .wrapping_sub($table.index_shift)
            .wrapping_sub(window_low);
        let win_range = $abs_pos - window_low;
        // Raw `+ 9` is safe here — see `bt_insert_step_no_rebase_body!`
        // for the full discussion of the upstream `STREAM_ABS_HEADROOM`
        // cap in `MatchTable::add_data`.
        let mut match_end_abs = $abs_pos + 9;
        let mut compares_left = $profile.max_chain_depth.min($search_depth);
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // Read the previous bucket head, then make the current position the
        // new head.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;
        // Upstream zstd semantics: `bestLength` starts at `lengthToBeat - 1`; rep/hash3
        // probing may raise it; BT then only reports strictly longer matches.
        // `min_match_len >= HC_FORMAT_MINMATCH (3)` by configure invariant,
        // so `min_match_len - 1 >= 2` cannot underflow.
        debug_assert!(
            $min_match_len >= $crate::encoding::cost_model::HC_FORMAT_MINMATCH,
            "min_match_len must be at least HC_FORMAT_MINMATCH"
        );
        let mut best_len = (*$best_len_for_skip).max($min_match_len - 1);

        // Upstream zstd-form loop condition: the stored-space window range check
        // (`s.wrapping_add(win_off) < win_range`) rejects out-of-window and
        // HC_EMPTY candidates here, so the terminating step never enters the
        // body — no wasted candidate_abs decode, matching upstream's
        // `while ... matchIndex >= matchLow`.
        while compares_left > 0 && (match_stored as usize).wrapping_add(win_off) < win_range {
            compares_left -= 1;
            // The condition proved this candidate is in `[window_low,
            // abs_pos)`, so `match_stored >= 1` (HC_EMPTY is out of range) and
            // the `- 1` cannot underflow; candidate_abs == base + match_stored.
            let candidate_abs = ($table.position_base + (match_stored as usize - 1))
                .wrapping_sub($table.index_shift);

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            if match_len > best_len {
                let offset = $abs_pos - candidate_abs;
                let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    $best_len_for_skip,
                    $crate::encoding::opt::types::MatchCandidate {
                        start: $abs_pos,
                        offset,
                        match_len,
                    },
                    $min_match_len,
                );
                if accepted {
                    best_len = match_len;
                    // BT walker invariants: `candidate_abs < abs_pos`
                    // and `match_len <= tail_limit = current_abs_end -
                    // abs_pos`. So `candidate_abs + match_len <
                    // abs_pos + tail_limit = current_abs_end`, which
                    // fits in `usize` on every supported target (32-bit
                    // i686 included) — the addition stays within the
                    // current block.
                    let candidate_end = candidate_abs + match_len;
                    if candidate_end > match_end_abs {
                        match_end_abs = candidate_end;
                    }
                    if match_len >= tail_limit
                        || match_len > $crate::encoding::cost_model::HC_OPT_NUM
                    {
                        break;
                    }
                }
            }

            // Match ran to the block tail: there is no differing byte left to
            // order the pair by, so stop the walk.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // Terminate whichever subtree links are still dangling.
        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        // Dict dual-probe (upstream zstd `ZSTD_dictMatchState`, zstd_opt.c:777-813):
        // after the live tree, descend the immutable dictionary BINARY TREE
        // (built in `prime_dms_bt`) with its OWN compare budget and push any
        // dict match longer than the live best into the ladder. The DUBT
        // descent reaches the longest dict match efficiently (a hash-chain
        // surfaced only the few same-bucket candidates and left most of the
        // dict savings unrealised at btlazy2 / btopt). Dict positions are
        // dictionary-relative concat indices in `[0, region)`, pinned at the
        // front of history, so a dict candidate at `dict_idx` sits at offset
        // `idx - dict_idx` (no upstream zstd `dmsIndexDelta`). The optimal parser
        // prices these (its DP lookahead values the repcode chain a dict match
        // seeds); the greedy/lazy parser commits the longest.
        if let Some(dms) = $table.dms.table() {
            let region = $table.dms.region_len();
            let dh = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                concat,
                idx,
                dms.hash_log,
                dms.mls,
            );
            let mut dcur = dms.hash_table[dh];
            // DUBT seed lengths: bytes already known common on each side, so
            // `$cmf` resumes from there (upstream zstd commonLengthSmaller/Larger).
            let mut common_smaller = 0usize;
            let mut common_larger = 0usize;
            let mut dms_compares = $profile.max_chain_depth.min($search_depth);
            while dms_compares > 0 && dcur != $crate::encoding::match_table::storage::HC_EMPTY {
                let dict_idx = (dcur - 1) as usize;
                // The dict tree holds only dict positions (`< region <= idx`).
                if dict_idx >= region || dict_idx >= idx {
                    break;
                }
                dms_compares -= 1;
                let pair = 2 * dict_idx;
                let seed = common_smaller.min(common_larger);
                // SAFETY: `dict_idx < idx` and `idx + tail_limit <=
                // concat.len()` (checked at entry); same umbrella as the live
                // walk's `$cmf`. `seed <= prior match_len <= tail_limit`.
                let match_len = unsafe { $cmf(concat, idx, dict_idx, tail_limit, seed) };
                if match_len > best_len {
                    let offset = idx - dict_idx;
                    let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        $best_len_for_skip,
                        $crate::encoding::opt::types::MatchCandidate {
                            start: $abs_pos,
                            offset,
                            match_len,
                        },
                        $min_match_len,
                    );
                    if accepted {
                        best_len = match_len;
                        let candidate_end = $abs_pos + match_len;
                        if candidate_end > match_end_abs {
                            match_end_abs = candidate_end;
                        }
                        if match_len > $crate::encoding::cost_model::HC_OPT_NUM {
                            break;
                        }
                    }
                }
                // Match reached the block tail: can't order the pair (upstream zstd
                // `ip+matchLength == iLimit`), and indexing `concat[idx +
                // match_len]` below would step past the searchable region.
                if match_len >= tail_limit {
                    break;
                }
                // Descend the DUBT (upstream zstd zstd_opt.c:806-811): dict candidate
                // smaller than input → its larger child is closer to `idx`.
                if concat[dict_idx + match_len] < concat[idx + match_len] {
                    common_smaller = match_len;
                    dcur = dms.chain_table[pair + 1];
                } else {
                    common_larger = match_len;
                    dcur = dms.chain_table[pair];
                }
            }
        }

        // `match_end_abs >= abs_pos + 9 >= 9` (initialized and monotonic),
        // so `match_end_abs - 8 >= 1` cannot underflow.
        $table.skip_insert_until_abs = match_end_abs - 8;
    }};
}
6435pub(crate) use bt_insert_and_collect_matches_body;
6436
6437impl HcMatchGenerator {
6438    /// Heap bytes this generator owns: the shared match table plus the BT
6439    /// backend's optimal-parser / LDM scratch (the HC knobs are inline).
6440    fn heap_size(&self) -> usize {
6441        self.table.heap_size() + self.backend.heap_size()
6442    }
6443
6444    fn should_run_btultra2_seed_pass<S: super::strategy::Strategy>(
6445        &self,
6446        current_len: usize,
6447    ) -> bool {
6448        // The in-block two-pass dynamic-stats seed (`initStats_ultra`)
6449        // is btultra2-only. `TWO_PASS_SEED` is `false` for every other
6450        // strategy — including btultra, which now shares the hash3
6451        // short-match probe but stays single-pass — so the seed call and
6452        // its body drop at codegen time for all non-btultra2 kernels.
6453        if !S::TWO_PASS_SEED {
6454            return false;
6455        }
6456        let HcBackend::Bt(bt) = &self.backend else {
6457            return false;
6458        };
6459        bt.opt_state.lit_length_sum == 0
6460            && bt.opt_state.dictionary_seed.is_none()
6461            && !self.table.dictionary_primed_for_frame
6462            && bt.ldm_sequences.is_empty()
6463            && self.table.window_size == current_len
6464            && self.table.history_abs_start == 0
6465            && self.table.chunk_lens.len() == 1
6466            && current_len > HC_PREDEF_THRESHOLD
6467    }
6468
6469    fn new(max_window_size: usize) -> Self {
6470        Self {
6471            table: super::match_table::storage::MatchTable::new(max_window_size),
6472            hc: super::hc::HcMatcher::new(2, HC_SEARCH_DEPTH, HC_TARGET_LEN),
6473            // Default to the zero-sized HC backend; `configure()` swaps
6474            // in a `BtMatcher` only when an optimal strategy lands.
6475            backend: HcBackend::Hc,
6476            // Lazy is the per-construct default — every production
6477            // caller calls `configure()` before the first encode and
6478            // overwrites this. Tests that drive `HcMatchGenerator`
6479            // without calling `configure()` end up in the
6480            // `start_matching_lazy` arm of the test dispatcher, which
6481            // matches the previous default behaviour.
6482            strategy_tag: super::strategy::StrategyTag::Lazy,
6483        }
6484    }
6485
6486    fn configure(&mut self, config: HcConfig, tag: super::strategy::StrategyTag, window_log: u8) {
6487        use super::strategy::StrategyTag;
6488        // Mirror the driver-resolved strategy tag so the
6489        // `#[cfg(test)] start_matching` dispatcher can route
6490        // BtOpt / BtUltra / BtUltra2 to distinct monomorphisations.
6491        self.strategy_tag = tag;
6492        let is_btultra2 = tag == StrategyTag::BtUltra2;
6493        let uses_bt = matches!(
6494            tag,
6495            StrategyTag::Btlazy2
6496                | StrategyTag::BtOpt
6497                | StrategyTag::BtUltra
6498                | StrategyTag::BtUltra2
6499        );
6500        // btultra and btultra2 both run the mls=3 hash3 short-match probe
6501        // (clevels.h minMatch 3). The `is_btultra2` flag below stays
6502        // exclusive to btultra2 because it tweaks the BT rebase boundary,
6503        // not match finding.
6504        let wants_hash3 = matches!(tag, StrategyTag::BtUltra | StrategyTag::BtUltra2);
6505        let next_hash3_log = if wants_hash3 {
6506            HC3_HASH_LOG.min(window_log as usize)
6507        } else {
6508            0
6509        };
6510        let resize = self.table.hash_log != config.hash_log
6511            || self.table.chain_log != config.chain_log
6512            || self.table.hash3_log != next_hash3_log;
6513        // Capture the layout flip BEFORE `uses_bt` is overwritten below — it
6514        // feeds the dms invalidation (the dms is keyed by layout too).
6515        let uses_bt_changed = self.table.uses_bt != uses_bt;
6516        self.table.hash_log = config.hash_log;
6517        self.table.chain_log = config.chain_log;
6518        self.table.hash3_log = next_hash3_log;
6519        self.hc.search_depth = if uses_bt {
6520            config.search_depth
6521        } else {
6522            config.search_depth.min(MAX_HC_SEARCH_DEPTH)
6523        };
6524        self.hc.target_len = config.target_len;
6525        // Mirror strategy-derived flags + HC search depth onto MatchTable
6526        // so the BT walker and rebase machinery can read them directly
6527        // without dispatching back through HcMatchGenerator.
6528        self.table.search_depth = self.hc.search_depth;
6529        self.table.is_btultra2 = is_btultra2;
6530        self.table.uses_bt = uses_bt;
6531        // BT finder hash width, upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`,
6532        // carried explicitly in the level config so a `target_length` override
6533        // cannot silently flip the finder between 5- and 4-byte hashing. Only
6534        // the BT body reads it; HC/lazy levels leave it at 4. clevels.h
6535        // (srcSize > 256 KiB tier): btlazy2 L13-15 + btopt L16 are minMatch=5,
6536        // btopt L17 is minMatch=4, btultra/btultra2 are minMatch=3 (4-byte main
6537        // hash + the hash3 short-match probe).
6538        // The cached dms is keyed by the full (region, layout, mls, hash_log)
6539        // shape that `build_dms!` validates on the normal prime path, but the
6540        // reborrow fast path in `MatchTable::reset` reuses it on `dms.is_primed()`
6541        // ALONE. A reused-compressor level switch can change the search mls (e.g.
6542        // btlazy2 -> lazy), the table geometry (hash_log / chain_log / hash3,
6543        // captured in `resize`), OR the HC<->BT layout (`uses_bt_changed`)
6544        // independently of each other, and any of them leaves the dms hashed for
6545        // a different shape. Invalidate on ANY so the next dict frame re-primes at
6546        // the new shape (configure runs before reset) instead of probing a
6547        // mismatched dms and silently degrading match quality. Over-invalidation
6548        // only costs a re-prime, which a real shape change needs anyway.
6549        let mls_changed = self.table.search_mls != config.search_mls;
6550        if resize || mls_changed || uses_bt_changed {
6551            self.table.dms.invalidate();
6552        }
6553        self.table.search_mls = config.search_mls;
6554        // Stage D: promote the backend discriminator. HC modes drop the
6555        // BT scratch buffers entirely; switching back into a BT mode
6556        // allocates a fresh `BtMatcher` on demand.
6557        match (&self.backend, self.table.uses_bt) {
6558            (HcBackend::Hc, true) => {
6559                self.backend = HcBackend::Bt(alloc::boxed::Box::new(super::bt::BtMatcher::new()));
6560            }
6561            (HcBackend::Bt(_), false) => {
6562                self.backend = HcBackend::Hc;
6563            }
6564            _ => {}
6565        }
6566        if resize && !self.table.hash_table.is_empty() {
6567            // Force reallocation on next ensure_tables() call.
6568            self.table.hash_table.clear();
6569            self.table.hash3_table.clear();
6570            self.table.chain_table.clear();
6571        }
6572    }
6573
6574    fn seed_dictionary_entropy(
6575        &mut self,
6576        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
6577        ll: Option<&crate::fse::fse_encoder::FSETable>,
6578        ml: Option<&crate::fse::fse_encoder::FSETable>,
6579        of: Option<&crate::fse::fse_encoder::FSETable>,
6580    ) {
6581        if let HcBackend::Bt(bt) = &mut self.backend {
6582            bt.opt_state.seed_dictionary_entropy(huff, ll, ml, of);
6583        }
6584    }
6585
6586    /// Install (or clear) the long-distance-match producer (#27). Only
6587    /// the BT backend owns an `ldm_producer` slot; on the HC (lazy)
6588    /// backend the producer is dropped because there is no optimal-parser
6589    /// candidate buffer to seed. Call after [`Self::reset`].
6590    #[cfg(feature = "hash")]
6591    fn set_ldm_producer(&mut self, producer: Option<super::ldm::LdmProducer>) {
6592        if let HcBackend::Bt(bt) = &mut self.backend {
6593            bt.ldm_producer = producer;
6594        }
6595    }
6596
6597    /// Move the LDM producer out of the BT backend, leaving `None`. Used by the
6598    /// dictionary snapshot path: the producer carries no dictionary state (LDM
6599    /// is not dict-primed; its hash table is empty at capture), so it is not
6600    /// retained in the snapshot — the working frame's freshly-reset producer is
6601    /// reinstated on restore instead.
6602    #[cfg(feature = "hash")]
6603    fn take_ldm_producer(&mut self) -> Option<super::ldm::LdmProducer> {
6604        if let HcBackend::Bt(bt) = &mut self.backend {
6605            bt.ldm_producer.take()
6606        } else {
6607            None
6608        }
6609    }
6610
6611    fn reset(&mut self, reuse_space: impl FnMut(Vec<u8>)) {
6612        self.table.reset(reuse_space);
6613        if let HcBackend::Bt(bt) = &mut self.backend {
6614            bt.reset();
6615        }
6616    }
6617
6618    /// Backfill positions from the tail of the previous slice that couldn't be
6619    /// hashed at the time (insert_position needs 4 bytes of lookahead).
6620    fn skip_matching(&mut self, incompressible_hint: Option<bool>) {
6621        self.table.skip_matching(incompressible_hint);
6622    }
6623
6624    /// Runtime-dispatched entry kept only for in-crate tests. Production
6625    /// callers reach the inner loops through
6626    /// [`Self::start_matching_strategy`] / [`MatchGeneratorDriver::compress_block`]
6627    /// which pick the lazy / optimal arm from `S::USE_BT` at
6628    /// monomorphisation time.
6629    #[cfg(test)]
6630    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6631        use super::strategy::{self, StrategyTag};
6632        // Dispatch on the mirrored `strategy_tag` so each test runs
6633        // under the same monomorphisation production would pick.
6634        // `BtOpt` / `BtUltra` / `BtUltra2` remain distinct here even
6635        // though `table.uses_bt` / `is_btultra2` alone can't separate
6636        // BtOpt from BtUltra.
6637        match self.strategy_tag {
6638            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
6639                self.start_matching_lazy(&mut handle_sequence)
6640            }
6641            StrategyTag::Btlazy2 => self.start_matching_btlazy2(&mut handle_sequence),
6642            StrategyTag::BtOpt => {
6643                self.start_matching_optimal::<strategy::BtOpt>(&mut handle_sequence)
6644            }
6645            StrategyTag::BtUltra => {
6646                self.start_matching_optimal::<strategy::BtUltra>(&mut handle_sequence)
6647            }
6648            StrategyTag::BtUltra2 => {
6649                self.start_matching_optimal::<strategy::BtUltra2>(&mut handle_sequence)
6650            }
6651        }
6652    }
6653
6654    /// Strategy-aware entry point used by
6655    /// [`MatchGeneratorDriver::compress_block`]. Branches on
6656    /// `S::USE_BT` — a compile-time `const` — so each
6657    /// monomorphisation keeps exactly one arm: `Lazy` /
6658    /// `Fast` / `Dfast` / `Greedy` see only `start_matching_lazy`,
6659    /// `BtOpt` / `BtUltra` / `BtUltra2` see only
6660    /// `start_matching_optimal`. The inherent test-only
6661    /// [`HcMatchGenerator::start_matching`] reaches the same arms by
6662    /// runtime-matching on `self.strategy_tag` (the parse-mode field
6663    /// has been removed); production never invokes that path.
6664    pub(crate) fn start_matching_strategy<S: super::strategy::Strategy>(
6665        &mut self,
6666        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
6667    ) {
6668        debug_assert_eq!(
6669            self.table.uses_bt,
6670            S::USE_BT,
6671            "Strategy::USE_BT disagrees with runtime table.uses_bt at HC dispatch"
6672        );
6673        if S::USE_BT {
6674            self.start_matching_optimal::<S>(handle_sequence)
6675        } else {
6676            self.start_matching_lazy(handle_sequence)
6677        }
6678    }
6679
6680    /// Dispatcher: pick the dict-aware monomorph when a separate dms is primed
6681    /// (attach-mode dictionary), else the no-dict monomorph. Mirrors upstream's
6682    /// compile-time `dictMode` split — the `DICT = false` body carries no dms
6683    /// code at all, so the no-dict hot path is unaffected by the dict search.
6684    pub(crate) fn start_matching_lazy(
6685        &mut self,
6686        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6687    ) {
6688        if self.table.dms.is_primed() {
6689            self.start_matching_lazy_impl::<true>(handle_sequence);
6690        } else {
6691            self.start_matching_lazy_impl::<false>(handle_sequence);
6692        }
6693    }
6694
    /// Lazy hash-chain parse of the current block, monomorphised on `DICT`.
    ///
    /// Walks the block position by position: asks `self.hc` for the best
    /// match, lets `pick_lazy_match` decide whether to commit it or defer by
    /// one byte, and reports results through `handle_sequence` — one
    /// `Sequence::Triple` per committed match and, if bytes remain after the
    /// last match, a trailing `Sequence::Literals`. Returns without emitting
    /// anything when the current block is empty.
    ///
    /// `DICT` is forwarded as a const argument to `find_best_match` /
    /// `pick_lazy_match`; [`Self::start_matching_lazy`] chooses it from
    /// `self.table.dms.is_primed()`.
    fn start_matching_lazy_impl<const DICT: bool>(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        self.table.ensure_tables();

        // `current_block_range()` is borrowed-aware: owned → last committed
        // chunk; borrowed → the staged in-place block range.
        let (current_abs_start, current_len) = self.table.current_block_range();
        if current_len == 0 {
            return;
        }
        // The current block is the tail of `history` (owned) or the staged
        // borrowed range (`get_last_space()` resolves both). Hoist it as a raw
        // slice so we avoid re-borrowing `self.table` (which would conflict
        // with the `offset_hist` write).
        let current_ptr = self.table.get_last_space().as_ptr();
        // SAFETY: the routine mutates the hash/chain tables + `offset_hist`
        // but never reallocates `history`, so the pointer stays valid for the
        // whole call. Assumes `get_last_space()` spans at least the
        // `current_len` bytes reported by `current_block_range()` — TODO
        // confirm against `MatchTable`.
        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };

        // Full live history (dict + committed blocks + current block), hoisted
        // ONCE for the whole position scan and threaded into every
        // `find_best_match` / `pick_lazy_match` call. `live_history()` is
        // loop-invariant here (the scan mutates the hash/chain tables +
        // `offset_hist` but never the history bytes or length), so re-fetching
        // it per find — inside `hash_chain_candidate` + the rep probe, plus
        // again for each lazy lookahead at pos+1 / pos+2 — was pure
        // per-position overhead. Same raw-slice detach as `current` so the
        // loop's `&mut self.table` inserts coexist with this `&[u8]`.
        let concat: &[u8] = {
            let lh = self.table.live_history();
            // SAFETY: pointer and length come from the same live slice; the
            // history bytes and length are not changed during the scan (see
            // the note above), so the detached slice stays valid.
            unsafe { core::slice::from_raw_parts(lh.as_ptr(), lh.len()) }
        };
        // Dict-match-state primed flag, hoisted ONCE for the scan: it is
        // block-invariant (the dict is primed before the block) and lives on the
        // cold `dms` cacheline, so the per-find `dms.is_primed()` load was a
        // measurable hot-path cost (~8% of `hash_chain_candidate` on the
        // dict-over-random fixture). The `DICT = false` monomorph ignores it.
        let dms_primed = self.table.dms.is_primed();

        let current_abs_end = current_abs_start + current_len;
        self.table
            .backfill_boundary_positions(current_abs_start, current_abs_end);

        // `pos` is the block-relative scan cursor; `literals_start` is the
        // block-relative start of the pending (not yet emitted) literal run.
        let mut pos = 0usize;
        let mut literals_start = 0usize;
        while pos + HC_MIN_MATCH_LEN <= current_len {
            let abs_pos = current_abs_start + pos;
            let lit_len = pos - literals_start;

            // `find_best_match` returns the forward `(offset, length)` in
            // registers (`HcMatch`, 16 bytes) — no 24-byte `MatchCandidate` /
            // 32-byte `Option` spilled-and-copied per position. The backward
            // extension that yields `start` runs ONCE here, after the lazy
            // decision settles, exactly like upstream's lazy loop.
            let best =
                self.hc
                    .find_best_match::<DICT>(concat, dms_primed, &self.table, abs_pos, lit_len);
            if best.is_match() {
                // `pick_lazy_match` returning `true` means "commit `best` at
                // this position"; `false` falls through to the one-byte defer
                // below.
                if self.hc.pick_lazy_match::<DICT>(
                    concat,
                    dms_primed,
                    &self.table,
                    abs_pos,
                    lit_len,
                    best,
                ) {
                    // Backward-extend over the literal run (upstream `zstd_lazy.c`
                    // after rep-vs-chain selection). The offset is preserved;
                    // `start` and `match_len` grow by the same amount, bounded by
                    // `literals_start` (the `min_abs` floor) so it never crosses
                    // an already-emitted sequence.
                    let history_abs_start = self.table.history_abs_start;
                    let min_abs = abs_pos - lit_len;
                    let mut start_abs = abs_pos;
                    let mut cand_abs = abs_pos - best.offset;
                    let mut match_len = best.match_len;
                    // `concat[0]` corresponds to absolute `history_abs_start`,
                    // hence the `- history_abs_start` translation on both sides.
                    while start_abs > min_abs
                        && cand_abs > history_abs_start
                        && concat[cand_abs - history_abs_start - 1]
                            == concat[start_abs - history_abs_start - 1]
                    {
                        start_abs -= 1;
                        cand_abs -= 1;
                        match_len += 1;
                    }
                    // `start_abs + match_len` is the match end; it is unchanged
                    // by the backward extension (both moved by the same amount).
                    self.table.insert_match_span(abs_pos, start_abs + match_len);
                    let start = start_abs - current_abs_start;
                    let literals = &current[literals_start..start];
                    handle_sequence(Sequence::Triple {
                        literals,
                        offset: best.offset,
                        match_len,
                    });
                    // Called for its update of `offset_hist`; the returned
                    // value is intentionally discarded.
                    let _ = encode_offset_with_history(
                        best.offset as u32,
                        literals.len() as u32,
                        &mut self.table.offset_hist,
                    );
                    pos = start + match_len;
                    literals_start = pos;
                    continue;
                }
                // Lazy lookahead found a better match at `abs_pos + 1` / `+ 2`
                // (defer): advance exactly ONE byte (upstream
                // `ZSTD_compressBlock_lazy_generic`) so the deferred candidate is
                // re-evaluated at its own position; the no-match skip below could
                // jump past it once the literal run reaches 256+ bytes.
                self.table.insert_position(abs_pos);
                pos += 1;
                continue;
            }
            // No match found.
            self.table.insert_position(abs_pos);
            // Lazy skipping (upstream zstd `ZSTD_compressBlock_lazy_generic`,
            // zstd_lazy.c:1614): advance faster over runs with no match.
            // `step = ((ip - anchor) >> kSearchStrength) + 1` with
            // kSearchStrength = 8, where `ip - anchor` is the current
            // literal-run length. On compressible input the run stays short
            // (step == 1, identical to a 1-byte advance); on incompressible
            // / dict-over-random input the run grows so the parser skips
            // ahead (one search per `step` positions) instead of searching
            // every byte. Skipped positions are not inserted, mirroring
            // upstream (it inserts only searched positions during a no-match
            // run). Ratio follows upstream (not byte-identical).
            let step = ((pos - literals_start) >> 8) + 1;
            pos += step;
            // No clamp needed before the tail loop: the search bound and the
            // hashable bound are both `pos + HC_MIN_MATCH_LEN <= current_len`
            // (HC_MIN_MATCH_LEN == 4 == the insert width), so there is no
            // non-searchable-but-hashable anchor to miss. Positions the skip
            // jumps over inside the searchable region are intentionally not
            // inserted — same as upstream zstd, which advances past them via
            // the identical `ip += step` and never hashes them either.
        }

        // Insert remaining hashable positions in the tail (the matching loop
        // stops at HC_MIN_MATCH_LEN but insert_position only needs 4 bytes).
        // NOTE(review): if HC_MIN_MATCH_LEN is 4, as the comment inside the
        // loop above states, this condition is already false when the main
        // loop exits and the loop body never runs — confirm whether it is
        // kept only as a guard against a larger HC_MIN_MATCH_LEN.
        while pos + 4 <= current_len {
            self.table.insert_position(current_abs_start + pos);
            pos += 1;
        }

        // Whatever follows the last emitted match goes out as plain literals.
        if literals_start < current_len {
            handle_sequence(Sequence::Literals {
                literals: &current[literals_start..],
            });
        }
    }
6845
6846    /// Register the borrowed input window for the no-copy one-shot path.
6847    /// # Safety
6848    /// `buffer` must outlive the borrowed scans (see `MatchTable`).
6849    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
6850        // SAFETY: forwarded liveness contract.
6851        unsafe { self.table.set_borrowed_window(buffer) };
6852    }
6853
6854    pub(crate) fn clear_borrowed_window(&mut self) {
6855        self.table.clear_borrowed_window();
6856    }
6857
6858    /// Borrowed (no-copy) equivalent of [`Self::start_matching_lazy`]: stage
6859    /// the in-place block range, then run the same lazy chain parse. The
6860    /// parse reads its range via `current_block_range()` and its bytes via
6861    /// `get_last_space()` / `live_history()`, all borrowed-aware, so the block
6862    /// is scanned in place with the per-position window_low offset cap.
6863    pub(crate) fn start_matching_lazy_borrowed(
6864        &mut self,
6865        block_start: usize,
6866        block_end: usize,
6867        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6868    ) {
6869        self.table.stage_borrowed_block(block_start, block_end);
6870        self.start_matching_lazy(handle_sequence);
6871    }
6872
6873    /// Borrowed (no-copy) equivalent of the lazy `skip_matching`: stage the
6874    /// in-place block, then seed positions without an owned-history append.
6875    pub(crate) fn skip_matching_borrowed(
6876        &mut self,
6877        block_start: usize,
6878        block_end: usize,
6879        incompressible_hint: Option<bool>,
6880    ) {
6881        self.table.stage_borrowed_block(block_start, block_end);
6882        self.table.skip_matching(incompressible_hint);
6883    }
6884
6885    /// Upstream zstd `ZSTD_btlazy2` (levels 13-15): binary-tree match finder with a
6886    /// greedy/lazy parse. Bare dispatcher — resolves the runtime tier ONCE
6887    /// per block via `select_kernel()` and calls the matching
6888    /// `start_matching_btlazy2_<kernel>` wrapper, so the per-position BT
6889    /// collect runs under a single `#[target_feature]` umbrella (mirrors
6890    /// `build_optimal_plan_impl`). See `start_matching_btlazy2_body!` for the
6891    /// shared loop.
6892    fn start_matching_btlazy2(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6893        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6894        unsafe {
6895            self.start_matching_btlazy2_neon(&mut handle_sequence)
6896        }
6897        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6898        {
6899            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
6900            match select_kernel() {
6901                FastpathKernel::Avx2Bmi2 => unsafe {
6902                    self.start_matching_btlazy2_avx2_bmi2(&mut handle_sequence)
6903                },
6904                FastpathKernel::Sse42 => unsafe {
6905                    self.start_matching_btlazy2_sse42(&mut handle_sequence)
6906                },
6907                FastpathKernel::Scalar => self.start_matching_btlazy2_scalar(&mut handle_sequence),
6908            }
6909        }
6910        #[cfg(not(any(
6911            all(target_arch = "aarch64", target_endian = "little"),
6912            target_arch = "x86",
6913            target_arch = "x86_64"
6914        )))]
6915        {
6916            self.start_matching_btlazy2_scalar(&mut handle_sequence)
6917        }
6918    }
6919
    /// NEON tier of the btlazy2 parse: expands `start_matching_btlazy2_body!`
    /// with the NEON candidate collector and the NEON
    /// `count_match_from_indices`. Only compiled for little-endian aarch64.
    ///
    /// # Safety
    ///
    /// Declared with `#[target_feature(enable = "neon")]`; the caller must
    /// ensure the running CPU supports NEON.
    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    #[target_feature(enable = "neon")]
    unsafe fn start_matching_btlazy2_neon(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_neon,
            crate::encoding::fastpath::neon::count_match_from_indices
        )
    }
6933
    /// SSE4.2 tier of the btlazy2 parse: expands
    /// `start_matching_btlazy2_body!` with the SSE4.2 candidate collector and
    /// the SSE4.2 `count_match_from_indices`. Only compiled for x86 / x86_64.
    ///
    /// # Safety
    ///
    /// Declared with `#[target_feature(enable = "sse4.2")]`; the caller must
    /// ensure the running CPU supports SSE4.2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse4.2")]
    unsafe fn start_matching_btlazy2_sse42(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_sse42,
            crate::encoding::fastpath::sse42::count_match_from_indices
        )
    }
6947
    /// AVX2 + BMI2 tier of the btlazy2 parse: expands
    /// `start_matching_btlazy2_body!` with the AVX2/BMI2 candidate collector
    /// and the AVX2/BMI2 `count_match_from_indices`. Only compiled for
    /// x86 / x86_64.
    ///
    /// # Safety
    ///
    /// Declared with `#[target_feature(enable = "avx2,bmi2")]`; the caller
    /// must ensure the running CPU supports both AVX2 and BMI2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2,bmi2")]
    unsafe fn start_matching_btlazy2_avx2_bmi2(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_avx2_bmi2,
            crate::encoding::fastpath::avx2_bmi2::count_match_from_indices
        )
    }
6961
    /// Scalar tier of the btlazy2 parse: expands
    /// `start_matching_btlazy2_body!` with the scalar candidate collector and
    /// the scalar `count_match_from_indices`.
    // Scalar wrapper: no `#[target_feature]`; `$collect` (the scalar collect)
    // is a safe fn, so the body macro's `unsafe` block is inert here — hence
    // the `unused_unsafe` allow. Same cfg as
    // `collect_optimal_candidates_initialized_scalar` (absent on
    // aarch64-little, where NEON is the baseline tier).
    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
    #[allow(unused_unsafe)]
    fn start_matching_btlazy2_scalar(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        start_matching_btlazy2_body!(
            self,
            handle_sequence,
            collect_optimal_candidates_initialized_scalar,
            crate::encoding::fastpath::scalar::count_match_from_indices
        )
    }
6979
    /// Optimal (price-based) parse of the current block for strategy `S`.
    ///
    /// Steps, in order:
    /// 1. Prepare the table (limited update after a long match, boundary
    ///    backfill, `next_to_update3`) and the backend's LDM candidates.
    /// 2. Run the BtUltra2 seed pass when
    ///    `should_run_btultra2_seed_pass::<S>` says so.
    /// 3. Build the plan segment by segment with `build_optimal_plan::<S>`,
    ///    feeding each segment back into the statistics via
    ///    `BtMatcher::update_plan_stats_segment`.
    /// 4. Emit the accumulated plan through `table.emit_optimal_plan`, which
    ///    receives `handle_sequence`.
    ///
    /// The backend's `opt_state` and `opt_segment_plan_scratch` are moved out
    /// for the duration of the call and stored back at the end. Returns
    /// immediately, before any of the steps, when the current block is empty.
    fn start_matching_optimal<S: super::strategy::Strategy>(
        &mut self,
        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
    ) {
        self.table.ensure_tables();
        // Borrowed-aware: owned → last committed chunk; borrowed → staged
        // in-place block range.
        let (current_abs_start, current_len) = self.table.current_block_range();
        if current_len == 0 {
            return;
        }
        let current_ptr = self.table.get_last_space().as_ptr();
        // SAFETY: `start_matching_optimal()` mutates tables/state but never
        // mutates or reallocates `self.table.history`, so this tail slice
        // remains valid for the duration of the routine and avoids cloning the
        // full block. Assumes `get_last_space()` spans at least `current_len`
        // bytes — TODO confirm against `MatchTable`.
        let current = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };

        let current_abs_end = current_abs_start + current_len;
        self.table
            .apply_limited_update_after_long_match(current_abs_start);
        // Captured BEFORE the boundary backfill and written to
        // `next_to_update3` after it; the statement order here is deliberate.
        let hash3_start_cursor = self
            .table
            .skip_insert_until_abs
            .max(self.table.history_abs_start);
        self.table
            .backfill_boundary_positions(current_abs_start, current_abs_end);
        self.table.next_to_update3 = hash3_start_cursor;
        // Borrow split: `prepare_ldm_candidates` needs immutable
        // access to the live history (the post-`history_start`
        // slice of `self.table.history`) while it mutates the LDM
        // bucket table owned by `self.backend.bt_mut()`. Both live
        // in disjoint fields of `Self`, so we capture the slice +
        // its base before reaching for `bt_mut()`.
        //
        // The producer operates in absolute stream coordinates
        // throughout; `live_history[0]` corresponds to absolute
        // `history_abs_start` (upstream zstd `base + dictLimit`), and the
        // abs→slice translation happens inside the producer at
        // each `live_history[..]` access. Passing the full
        // `history` Vec would index into the dead prefix (the
        // bytes already retired past `history_start`).
        let live_history = self.table.live_history();
        let history_abs_start = self.table.history_abs_start;
        self.backend.bt_mut().prepare_ldm_candidates(
            live_history,
            history_abs_start,
            current_abs_start,
            current_len,
        );

        if self.should_run_btultra2_seed_pass::<S>(current_len) {
            self.run_btultra2_seed_pass(current, current_abs_start, current_len);
        }

        // Const-generic profile selection: every field is folded from
        // S's associated consts (MAX_CHAIN_DEPTH /
        // SUFFICIENT_MATCH_LEN / ACCURATE_PRICE / FAVOR_SMALL_OFFSETS),
        // so the optimiser produces the literal at codegen time
        // without a runtime match.
        let profile = HcOptimalCostProfile::const_for_strategy::<S>();
        // Move the statistics out of the backend (leaving a fresh
        // `HcOptState` behind) so `self` can be borrowed mutably by
        // `build_optimal_plan` while the stats are passed by reference.
        let mut opt_state =
            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
        opt_state.rescale_freqs(current, profile);
        // Reuse the backend's scratch vector for the plan; it is returned
        // (cleared) at the end of the function.
        let mut best_plan = core::mem::take(&mut self.backend.bt_mut().opt_segment_plan_scratch);
        best_plan.clear();
        let mut plan_reps = self.table.offset_hist;
        let (mut cursor, mut plan_litlen) =
            self.table.opt_start_cursor_and_litlen(current_abs_start);
        let mut plan_literals_cursor = 0usize;
        // Segments are only planned while more than 8 bytes remain.
        let match_loop_limit = current_len.saturating_sub(8);
        // NOTE(review): forward progress relies on `build_optimal_plan`
        // returning `consumed_len > 0` — confirm.
        while cursor < match_loop_limit {
            let remaining_len = current_len - cursor;
            let segment_abs_start = current_abs_start + cursor;
            // Index of this segment's first entry in `best_plan`.
            let segment_start = best_plan.len();
            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
                &current[cursor..],
                segment_abs_start,
                remaining_len,
                HcOptimalPlanState {
                    block_offset: cursor,
                    reps: plan_reps,
                    litlen: plan_litlen,
                    profile,
                },
                &opt_state,
                &mut best_plan,
            );
            // Feed only the newly appended segment into the statistics.
            BtMatcher::update_plan_stats_segment(
                current,
                current_len,
                &best_plan[segment_start..],
                &mut plan_literals_cursor,
                &mut plan_reps,
                &mut opt_state,
                profile.accurate,
            );
            // Carry the planner's end state into the next segment.
            plan_reps = end_reps;
            plan_litlen = end_litlen;
            cursor += consumed_len;
        }

        self.table
            .emit_optimal_plan(current_len, &best_plan, &mut handle_sequence);
        // Hand the scratch vector and the statistics back to the backend.
        best_plan.clear();
        self.backend.bt_mut().opt_segment_plan_scratch = best_plan;
        self.backend.bt_mut().opt_state = opt_state;
    }
7087
    /// First ("seed") pass of the BtUltra2 two-pass parse.
    ///
    /// Plans the whole block once with the BtUltra2 cost profile purely to
    /// collect entropy statistics into the backend's `opt_state`: every
    /// segment's plan entries are discarded right after they have been fed to
    /// `BtMatcher::update_plan_stats_segment`, and nothing is emitted.
    /// Afterwards the table's position bookkeeping is rewritten so the real
    /// pass does not reuse the matchfinder history produced here.
    ///
    /// `current` is the block's bytes, `current_abs_start` its absolute start
    /// position and `current_len` its length.
    fn run_btultra2_seed_pass(
        &mut self,
        current: &[u8],
        current_abs_start: usize,
        current_len: usize,
    ) {
        // The seed pass is BtUltra2-exclusive by name (the call in
        // `start_matching_optimal` is gated by
        // `should_run_btultra2_seed_pass`), so pin `S` to `BtUltra2` for both
        // the cost-profile lookup and the `build_optimal_plan::<S>` call
        // below.
        type S = super::strategy::BtUltra2;
        let seed_profile = HcOptimalCostProfile::const_for_strategy::<S>();
        // Move the statistics out of the backend (a fresh `HcOptState` is left
        // in its place) for the duration of the pass.
        let mut opt_state =
            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
        opt_state.rescale_freqs(current, seed_profile);
        let mut seed_reps = self.table.offset_hist;
        let (mut cursor, mut seed_litlen) =
            self.table.opt_start_cursor_and_litlen(current_abs_start);
        let mut seed_literals_cursor = 0usize;
        let mut seed_plan = core::mem::take(&mut self.backend.bt_mut().opt_seed_plan_scratch);
        seed_plan.clear();
        // Segments are only planned while more than 8 bytes remain.
        let match_loop_limit = current_len.saturating_sub(8);
        while cursor < match_loop_limit {
            let remaining_len = current_len - cursor;
            let segment_abs_start = current_abs_start + cursor;
            let segment_start = seed_plan.len();
            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
                &current[cursor..],
                segment_abs_start,
                remaining_len,
                HcOptimalPlanState {
                    block_offset: cursor,
                    reps: seed_reps,
                    litlen: seed_litlen,
                    profile: seed_profile,
                },
                &opt_state,
                &mut seed_plan,
            );
            BtMatcher::update_plan_stats_segment(
                current,
                current_len,
                &seed_plan[segment_start..],
                &mut seed_literals_cursor,
                &mut seed_reps,
                &mut opt_state,
                seed_profile.accurate,
            );
            // Only the statistics matter in this pass: drop the segment's
            // plan entries as soon as they have been accounted for.
            seed_plan.truncate(segment_start);
            seed_reps = end_reps;
            seed_litlen = end_litlen;
            cursor += consumed_len;
        }
        // Return the (emptied) scratch vector and the collected statistics.
        seed_plan.clear();
        self.backend.bt_mut().opt_seed_plan_scratch = seed_plan;
        self.backend.bt_mut().opt_state = opt_state;

        // Upstream zstd initStats_ultra keeps the collected entropy statistics but
        // invalidates the first-pass matchfinder history before the real pass.
        self.table.position_base = self.table.history_abs_start;
        self.table.index_shift = current_len;
        self.table.next_to_update3 = current_abs_start;
        self.table.skip_insert_until_abs = current_abs_start;
        // Upstream zstd `ZSTD_initStats_ultra()` invalidates the first scan by moving
        // `window.base` back by `srcSize`, making the real pass start at
        // `curr == srcSize` instead of 0. Position 0 is therefore a valid
        // table entry in the second pass even though raw C tables reserve
        // value 0 as empty during an unshifted first pass.
        self.table.allow_zero_relative_position = true;
    }
7158
7159    fn build_optimal_plan<S: super::strategy::Strategy>(
7160        &mut self,
7161        current: &[u8],
7162        current_abs_start: usize,
7163        current_len: usize,
7164        initial_state: HcOptimalPlanState,
7165        stats: &HcOptState,
7166        out: &mut Vec<HcOptimalSequence>,
7167    ) -> (u32, [u32; 3], usize, usize) {
7168        debug_assert!(S::USE_BT, "build_optimal_plan called on non-BT strategy");
7169        debug_assert_eq!(initial_state.profile.accurate, S::ACCURATE_PRICE);
7170        debug_assert_eq!(
7171            initial_state.profile.favor_small_offsets,
7172            S::FAVOR_SMALL_OFFSETS
7173        );
7174        // `S::ACCURATE_PRICE` / `S::FAVOR_SMALL_OFFSETS` cannot appear
7175        // as const-generic arguments yet (`generic_const_exprs` is
7176        // still unstable), so dispatch over a 4-arm match — but on the
7177        // strategy's ASSOCIATED CONSTS, not the runtime profile (the
7178        // `debug_assert_eq`s above pin the runtime profile to those
7179        // consts). A const scrutinee folds the three dead arms at
7180        // monomorphisation; matching the runtime profile instead kept
7181        // all four `#[inline(always)]` DP bodies (~16 KB each) alive in
7182        // EVERY `S` instantiation — ~360 KB of the wasm payload.
7183        match (S::ACCURATE_PRICE, S::FAVOR_SMALL_OFFSETS) {
7184            (true, false) => self.build_optimal_plan_impl::<S, true, false>(
7185                current,
7186                current_abs_start,
7187                current_len,
7188                initial_state,
7189                stats,
7190                out,
7191            ),
7192            (true, true) => self.build_optimal_plan_impl::<S, true, true>(
7193                current,
7194                current_abs_start,
7195                current_len,
7196                initial_state,
7197                stats,
7198                out,
7199            ),
7200            (false, false) => self.build_optimal_plan_impl::<S, false, false>(
7201                current,
7202                current_abs_start,
7203                current_len,
7204                initial_state,
7205                stats,
7206                out,
7207            ),
7208            (false, true) => self.build_optimal_plan_impl::<S, false, true>(
7209                current,
7210                current_abs_start,
7211                current_len,
7212                initial_state,
7213                stats,
7214                out,
7215            ),
7216        }
7217    }
7218
7219    /// Cross-platform DP entry. Picks the kernel-specific variant so the
7220    /// entire optimal-parser DP body (per-position match gathering, price
7221    /// updates, traceback) runs inside a single `target_feature` umbrella
7222    /// alongside the per-position `collect_optimal_candidates_initialized_
7223    /// <kernel>`. This eliminates the final ABI barrier on the hot per-
7224    /// position match-collection call — the level22 critical path is now
7225    /// one straight-line inline chain from DP body down through BT walk
7226    /// and match-length probes.
7227    #[inline(always)]
7228    fn build_optimal_plan_impl<
7229        S: super::strategy::Strategy,
7230        const ACCURATE_PRICE: bool,
7231        const FAVOR_SMALL_OFFSETS: bool,
7232    >(
7233        &mut self,
7234        current: &[u8],
7235        current_abs_start: usize,
7236        current_len: usize,
7237        initial_state: HcOptimalPlanState,
7238        stats: &HcOptState,
7239        out: &mut Vec<HcOptimalSequence>,
7240    ) -> (u32, [u32; 3], usize, usize) {
7241        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7242        unsafe {
7243            self.build_optimal_plan_impl_neon::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7244                current,
7245                current_abs_start,
7246                current_len,
7247                initial_state,
7248                stats,
7249                out,
7250            )
7251        }
7252        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7253        {
7254            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
7255            match select_kernel() {
7256                FastpathKernel::Avx2Bmi2 => unsafe {
7257                    self.build_optimal_plan_impl_avx2_bmi2::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7258                        current,
7259                        current_abs_start,
7260                        current_len,
7261                        initial_state,
7262                        stats,
7263                        out,
7264                    )
7265                },
7266                FastpathKernel::Sse42 => unsafe {
7267                    self.build_optimal_plan_impl_sse42::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7268                        current,
7269                        current_abs_start,
7270                        current_len,
7271                        initial_state,
7272                        stats,
7273                        out,
7274                    )
7275                },
7276                FastpathKernel::Scalar => self
7277                    .build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7278                        current,
7279                        current_abs_start,
7280                        current_len,
7281                        initial_state,
7282                        stats,
7283                        out,
7284                    ),
7285            }
7286        }
7287        // wasm with simd128: route through the simd128 DP body (4-lane price-set).
7288        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
7289        unsafe {
7290            self.build_optimal_plan_impl_simd128::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7291                current,
7292                current_abs_start,
7293                current_len,
7294                initial_state,
7295                stats,
7296                out,
7297            )
7298        }
7299        #[cfg(not(any(
7300            all(target_arch = "aarch64", target_endian = "little"),
7301            target_arch = "x86",
7302            target_arch = "x86_64",
7303            all(target_arch = "wasm32", target_feature = "simd128")
7304        )))]
7305        {
7306            self.build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7307                current,
7308                current_abs_start,
7309                current_len,
7310                initial_state,
7311                stats,
7312                out,
7313            )
7314        }
7315    }
7316
7317    /// NEON-umbrella DP body. Inlines
7318    /// `collect_optimal_candidates_initialized_neon` (and its entire
7319    /// per-position pipeline) directly into the DP loop.
7320    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7321    #[target_feature(enable = "neon")]
7322    unsafe fn build_optimal_plan_impl_neon<
7323        S: super::strategy::Strategy,
7324        const ACCURATE_PRICE: bool,
7325        const FAVOR_SMALL_OFFSETS: bool,
7326    >(
7327        &mut self,
7328        current: &[u8],
7329        current_abs_start: usize,
7330        current_len: usize,
7331        initial_state: HcOptimalPlanState,
7332        stats: &HcOptState,
7333        out: &mut Vec<HcOptimalSequence>,
7334    ) -> (u32, [u32; 3], usize, usize) {
7335        build_optimal_plan_impl_body!(
7336            self,
7337            S,
7338            current,
7339            current_abs_start,
7340            current_len,
7341            initial_state,
7342            stats,
7343            out,
7344            collect_optimal_candidates_initialized_neon,
7345            priceset_range_nonabort_neon,
7346        )
7347    }
7348
7349    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7350    #[target_feature(enable = "sse4.2")]
7351    unsafe fn build_optimal_plan_impl_sse42<
7352        S: super::strategy::Strategy,
7353        const ACCURATE_PRICE: bool,
7354        const FAVOR_SMALL_OFFSETS: bool,
7355    >(
7356        &mut self,
7357        current: &[u8],
7358        current_abs_start: usize,
7359        current_len: usize,
7360        initial_state: HcOptimalPlanState,
7361        stats: &HcOptState,
7362        out: &mut Vec<HcOptimalSequence>,
7363    ) -> (u32, [u32; 3], usize, usize) {
7364        build_optimal_plan_impl_body!(
7365            self,
7366            S,
7367            current,
7368            current_abs_start,
7369            current_len,
7370            initial_state,
7371            stats,
7372            out,
7373            collect_optimal_candidates_initialized_sse42,
7374            priceset_range_nonabort_sse41,
7375        )
7376    }
7377
7378    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7379    #[target_feature(enable = "avx2,bmi2")]
7380    unsafe fn build_optimal_plan_impl_avx2_bmi2<
7381        S: super::strategy::Strategy,
7382        const ACCURATE_PRICE: bool,
7383        const FAVOR_SMALL_OFFSETS: bool,
7384    >(
7385        &mut self,
7386        current: &[u8],
7387        current_abs_start: usize,
7388        current_len: usize,
7389        initial_state: HcOptimalPlanState,
7390        stats: &HcOptState,
7391        out: &mut Vec<HcOptimalSequence>,
7392    ) -> (u32, [u32; 3], usize, usize) {
7393        build_optimal_plan_impl_body!(
7394            self,
7395            S,
7396            current,
7397            current_abs_start,
7398            current_len,
7399            initial_state,
7400            stats,
7401            out,
7402            collect_optimal_candidates_initialized_avx2_bmi2,
7403            priceset_range_nonabort_avx2,
7404        )
7405    }
7406
7407    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
7408    // Body macros wrap callees in `unsafe { }` for the NEON/AVX/SSE
7409    // variants where callees are `unsafe fn`. The scalar wrappers route
7410    // through safe fns, so those blocks are redundant on this path.
7411    #[allow(unused_unsafe)]
7412    // The dispatch reaches this only on non-SIMD x86 (Scalar tier) and the
7413    // portable fallback; on wasm+simd128 the simd128 wrapper is selected, so
7414    // this is cfg-dead there.
7415    #[cfg_attr(
7416        all(target_arch = "wasm32", target_feature = "simd128"),
7417        allow(dead_code)
7418    )]
7419    fn build_optimal_plan_impl_scalar<
7420        S: super::strategy::Strategy,
7421        const ACCURATE_PRICE: bool,
7422        const FAVOR_SMALL_OFFSETS: bool,
7423    >(
7424        &mut self,
7425        current: &[u8],
7426        current_abs_start: usize,
7427        current_len: usize,
7428        initial_state: HcOptimalPlanState,
7429        stats: &HcOptState,
7430        out: &mut Vec<HcOptimalSequence>,
7431    ) -> (u32, [u32; 3], usize, usize) {
7432        build_optimal_plan_impl_body!(
7433            self,
7434            S,
7435            current,
7436            current_abs_start,
7437            current_len,
7438            initial_state,
7439            stats,
7440            out,
7441            collect_optimal_candidates_initialized_scalar,
7442            priceset_range_nonabort_scalar,
7443        )
7444    }
7445
7446    /// wasm `simd128`-umbrella DP body: scalar candidate collection (no wasm
7447    /// collect kernel) but the simd128 4-lane price-set.
7448    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
7449    #[target_feature(enable = "simd128")]
7450    // With `+simd128` in the wasm baseline the shared body macro's `unsafe`
7451    // blocks (needed by the safe scalar wrapper) are redundant inside this
7452    // target_feature fn.
7453    #[allow(unused_unsafe)]
7454    unsafe fn build_optimal_plan_impl_simd128<
7455        S: super::strategy::Strategy,
7456        const ACCURATE_PRICE: bool,
7457        const FAVOR_SMALL_OFFSETS: bool,
7458    >(
7459        &mut self,
7460        current: &[u8],
7461        current_abs_start: usize,
7462        current_len: usize,
7463        initial_state: HcOptimalPlanState,
7464        stats: &HcOptState,
7465        out: &mut Vec<HcOptimalSequence>,
7466    ) -> (u32, [u32; 3], usize, usize) {
7467        build_optimal_plan_impl_body!(
7468            self,
7469            S,
7470            current,
7471            current_abs_start,
7472            current_len,
7473            initial_state,
7474            stats,
7475            out,
7476            collect_optimal_candidates_initialized_scalar,
7477            priceset_range_nonabort_simd128,
7478        )
7479    }
7480
7481    #[cfg(test)]
7482    fn collect_optimal_candidates(
7483        &mut self,
7484        abs_pos: usize,
7485        current_abs_end: usize,
7486        profile: HcOptimalCostProfile,
7487        query: HcCandidateQuery,
7488        out: &mut Vec<MatchCandidate>,
7489    ) {
7490        use super::strategy::{self, StrategyTag};
7491        self.table.ensure_tables();
7492        // Dispatch purely from `self.strategy_tag` (set by
7493        // `configure()`). Tests must configure the matcher the same
7494        // way production does — wiring up `table.hash3_log` directly
7495        // without setting a matching `strategy_tag` is no longer
7496        // allowed.
7497        match self.strategy_tag {
7498            StrategyTag::BtUltra2 => self
7499                .collect_optimal_candidates_initialized::<strategy::BtUltra2, true>(
7500                    abs_pos,
7501                    current_abs_end,
7502                    profile,
7503                    query,
7504                    out,
7505                ),
7506            StrategyTag::BtUltra => self
7507                .collect_optimal_candidates_initialized::<strategy::BtUltra, true>(
7508                    abs_pos,
7509                    current_abs_end,
7510                    profile,
7511                    query,
7512                    out,
7513                ),
7514            StrategyTag::Btlazy2 => self
7515                .collect_optimal_candidates_initialized::<strategy::Btlazy2, true>(
7516                    abs_pos,
7517                    current_abs_end,
7518                    profile,
7519                    query,
7520                    out,
7521                ),
7522            StrategyTag::BtOpt => self
7523                .collect_optimal_candidates_initialized::<strategy::BtOpt, true>(
7524                    abs_pos,
7525                    current_abs_end,
7526                    profile,
7527                    query,
7528                    out,
7529                ),
7530            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
7531                self.collect_optimal_candidates_initialized::<strategy::Lazy, false>(
7532                    abs_pos,
7533                    current_abs_end,
7534                    profile,
7535                    query,
7536                    out,
7537                )
7538            }
7539        }
7540    }
7541
7542    /// Cross-platform entry. Picks the kernel-specific variant so the per-
7543    /// position pipeline (BT-tree fill, rep probing, hash3 probing, BT
7544    /// collect / HC chain walk) runs inside a single `target_feature`
7545    /// umbrella — all inner SIMD probes inline without ABI barriers.
7546    ///
7547    /// The on-encode hot path bypasses this dispatcher: `build_optimal_plan_impl_<kernel>`
7548    /// calls the matching `_<kernel>` variant directly. This entry is kept
7549    /// for the cfg(test)-only `collect_optimal_candidates` shim and any
7550    /// future caller that isn't already inside a kernel umbrella.
7551    #[allow(dead_code)]
7552    #[inline(always)]
7553    fn collect_optimal_candidates_initialized<
7554        S: super::strategy::Strategy,
7555        const USE_BT_MATCHFINDER: bool,
7556    >(
7557        &mut self,
7558        abs_pos: usize,
7559        current_abs_end: usize,
7560        profile: HcOptimalCostProfile,
7561        query: HcCandidateQuery,
7562        out: &mut Vec<MatchCandidate>,
7563    ) {
7564        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7565        unsafe {
7566            self.collect_optimal_candidates_initialized_neon::<S, USE_BT_MATCHFINDER>(
7567                abs_pos,
7568                current_abs_end,
7569                profile,
7570                query,
7571                out,
7572            )
7573        }
7574        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7575        {
7576            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
7577            match select_kernel() {
7578                FastpathKernel::Avx2Bmi2 => unsafe {
7579                    self.collect_optimal_candidates_initialized_avx2_bmi2::<S, USE_BT_MATCHFINDER>(
7580                        abs_pos,
7581                        current_abs_end,
7582                        profile,
7583                        query,
7584                        out,
7585                    )
7586                },
7587                FastpathKernel::Sse42 => unsafe {
7588                    self.collect_optimal_candidates_initialized_sse42::<S, USE_BT_MATCHFINDER>(
7589                        abs_pos,
7590                        current_abs_end,
7591                        profile,
7592                        query,
7593                        out,
7594                    )
7595                },
7596                FastpathKernel::Scalar => self
7597                    .collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
7598                        abs_pos,
7599                        current_abs_end,
7600                        profile,
7601                        query,
7602                        out,
7603                    ),
7604            }
7605        }
7606        #[cfg(not(any(
7607            all(target_arch = "aarch64", target_endian = "little"),
7608            target_arch = "x86",
7609            target_arch = "x86_64"
7610        )))]
7611        {
7612            self.collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
7613                abs_pos,
7614                current_abs_end,
7615                profile,
7616                query,
7617                out,
7618            )
7619        }
7620    }
7621
7622    /// NEON-umbrella variant. Every inner helper (`bt_update_tree_until_neon`,
7623    /// `for_each_repcode_candidate_with_reps_neon`, `hash3_candidate_neon`,
7624    /// `bt_insert_and_collect_matches_neon`, `fastpath::neon::
7625    /// common_prefix_len_ptr`) shares the NEON umbrella so the per-position
7626    /// pipeline executes as a single straight-line inline sequence.
7627    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7628    #[target_feature(enable = "neon")]
7629    unsafe fn collect_optimal_candidates_initialized_neon<
7630        S: super::strategy::Strategy,
7631        const USE_BT_MATCHFINDER: bool,
7632    >(
7633        &mut self,
7634        abs_pos: usize,
7635        current_abs_end: usize,
7636        profile: HcOptimalCostProfile,
7637        query: HcCandidateQuery,
7638        out: &mut Vec<MatchCandidate>,
7639    ) {
7640        collect_optimal_candidates_initialized_body!(
7641            self,
7642            S,
7643            abs_pos,
7644            current_abs_end,
7645            profile,
7646            query,
7647            out,
7648            USE_BT_MATCHFINDER,
7649            bt_update_tree_until_neon,
7650            bt_insert_and_collect_matches_neon,
7651            for_each_repcode_candidate_with_reps_neon,
7652            hash3_candidate_neon,
7653            crate::encoding::fastpath::neon::common_prefix_len_ptr,
7654        )
7655    }
7656
7657    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7658    #[target_feature(enable = "sse4.2")]
7659    unsafe fn collect_optimal_candidates_initialized_sse42<
7660        S: super::strategy::Strategy,
7661        const USE_BT_MATCHFINDER: bool,
7662    >(
7663        &mut self,
7664        abs_pos: usize,
7665        current_abs_end: usize,
7666        profile: HcOptimalCostProfile,
7667        query: HcCandidateQuery,
7668        out: &mut Vec<MatchCandidate>,
7669    ) {
7670        collect_optimal_candidates_initialized_body!(
7671            self,
7672            S,
7673            abs_pos,
7674            current_abs_end,
7675            profile,
7676            query,
7677            out,
7678            USE_BT_MATCHFINDER,
7679            bt_update_tree_until_sse42,
7680            bt_insert_and_collect_matches_sse42,
7681            for_each_repcode_candidate_with_reps_sse42,
7682            hash3_candidate_sse42,
7683            crate::encoding::fastpath::sse42::common_prefix_len_ptr,
7684        )
7685    }
7686
7687    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7688    #[target_feature(enable = "avx2,bmi2")]
7689    unsafe fn collect_optimal_candidates_initialized_avx2_bmi2<
7690        S: super::strategy::Strategy,
7691        const USE_BT_MATCHFINDER: bool,
7692    >(
7693        &mut self,
7694        abs_pos: usize,
7695        current_abs_end: usize,
7696        profile: HcOptimalCostProfile,
7697        query: HcCandidateQuery,
7698        out: &mut Vec<MatchCandidate>,
7699    ) {
7700        collect_optimal_candidates_initialized_body!(
7701            self,
7702            S,
7703            abs_pos,
7704            current_abs_end,
7705            profile,
7706            query,
7707            out,
7708            USE_BT_MATCHFINDER,
7709            bt_update_tree_until_avx2_bmi2,
7710            bt_insert_and_collect_matches_avx2_bmi2,
7711            for_each_repcode_candidate_with_reps_avx2_bmi2,
7712            hash3_candidate_avx2_bmi2,
7713            crate::encoding::fastpath::avx2_bmi2::common_prefix_len_ptr,
7714        )
7715    }
7716
7717    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
7718    // Macro emits `unsafe { }` wrappers for NEON/AVX/SSE variants; scalar
7719    // callees are safe so the blocks are redundant here only.
7720    #[allow(unused_unsafe)]
7721    fn collect_optimal_candidates_initialized_scalar<
7722        S: super::strategy::Strategy,
7723        const USE_BT_MATCHFINDER: bool,
7724    >(
7725        &mut self,
7726        abs_pos: usize,
7727        current_abs_end: usize,
7728        profile: HcOptimalCostProfile,
7729        query: HcCandidateQuery,
7730        out: &mut Vec<MatchCandidate>,
7731    ) {
7732        collect_optimal_candidates_initialized_body!(
7733            self,
7734            S,
7735            abs_pos,
7736            current_abs_end,
7737            profile,
7738            query,
7739            out,
7740            USE_BT_MATCHFINDER,
7741            bt_update_tree_until_scalar,
7742            bt_insert_and_collect_matches_scalar,
7743            for_each_repcode_candidate_with_reps_scalar,
7744            hash3_candidate_scalar,
7745            crate::encoding::fastpath::scalar::common_prefix_len_ptr,
7746        )
7747    }
7748}
7749
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn matches() {
    // Replays one emitted sequence into `decoded`, checking the structural
    // invariants of the sequence on the way.
    fn replay(seq: Sequence<'_>, decoded: &mut Vec<u8>) {
        match seq {
            Sequence::Literals { literals } => {
                assert!(!literals.is_empty());
                decoded.extend_from_slice(literals);
            }
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert!(offset > 0);
                assert!(match_len >= MIN_MATCH_LEN);
                decoded.extend_from_slice(literals);
                assert!(offset <= decoded.len());
                // Byte-by-byte so that an overlapping match (offset shorter
                // than the match) re-reads bytes it has just appended.
                let start = decoded.len() - offset;
                for src in start..start + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    let mut matcher = MatchGenerator::new(1000);
    let mut original_data = Vec::new();
    let mut reconstructed = Vec::new();

    // Hands a chunk to the matcher and records it as expected output.
    macro_rules! feed {
        ($bytes:expr) => {{
            let chunk: &[u8] = &$bytes;
            matcher.add_data(chunk.to_vec(), SuffixStore::with_capacity(100), |_, _| {});
            original_data.extend_from_slice(chunk);
        }};
    }

    // Replays `$count` sequences, then checks that the matcher reports that
    // nothing is left for the current chunk.
    macro_rules! replay_then_expect_drained {
        ($count:expr) => {{
            for _ in 0..$count {
                matcher.next_sequence(|seq| replay(seq, &mut reconstructed));
            }
            assert!(!matcher.next_sequence(|_| {}));
        }};
    }

    feed!([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
    replay_then_expect_drained!(1);

    feed!([
        1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
    ]);
    replay_then_expect_drained!(3);

    feed!([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);
    replay_then_expect_drained!(2);

    feed!([0, 0, 0, 0, 0]);
    replay_then_expect_drained!(1);

    feed!([7, 8, 9, 10, 11]);
    replay_then_expect_drained!(1);

    // This chunk is skipped rather than matched, so it is appended to the
    // reconstruction by hand and the matcher must have nothing to emit.
    feed!([1, 3, 5, 7, 9]);
    matcher.skip_matching();
    reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
    replay_then_expect_drained!(0);

    feed!([1, 3, 5, 7, 9]);
    replay_then_expect_drained!(1);

    feed!([0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);
    replay_then_expect_drained!(2);

    assert_eq!(reconstructed, original_data);
}
7870
/// Feeds two consecutive 128 KiB blocks of the same cyclic 10-byte pattern
/// through `DfastMatchGenerator` and checks that replaying the emitted
/// sequences reproduces each block, with the second block decoded on top of
/// the first block's history.
#[test]
fn dfast_matches_roundtrip_multi_block_pattern() {
    const BLOCK_LEN: usize = 128 * 1024;

    // Replays one sequence into `decoded`. Matches are copied byte by byte so
    // that an overlapping match re-reads bytes it has just appended.
    fn replay_into(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                let start = decoded.len() - offset;
                for src in start..start + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    let pattern = [9, 21, 44, 184, 19, 96, 171, 109, 141, 251];
    let make_block = || -> Vec<u8> { pattern.iter().copied().cycle().take(BLOCK_LEN).collect() };
    let first_block = make_block();
    let second_block = make_block();

    let mut matcher = DfastMatchGenerator::new(1 << 22);

    matcher.add_data(first_block.clone(), |_| {});
    let mut history = Vec::new();
    matcher.start_matching(|seq| replay_into(&mut history, seq));
    assert_eq!(history, first_block);

    matcher.add_data(second_block.clone(), |_| {});
    let prefix_len = history.len();
    matcher.start_matching(|seq| replay_into(&mut history, seq));

    assert_eq!(&history[prefix_len..], second_block.as_slice());
}
7905
/// Regression for the `DFAST_MIN_MATCH_LEN: 6 -> 5` drop. The fixture
/// is built so the longest available match is EXACTLY 5 bytes — a
/// matcher that still effectively requires a 6-byte floor would emit
/// only literals here and the assertion would catch the silent
/// 5-byte miss.
///
/// Fixture layout (55 B):
///   byte  0       `'Z'`      — lead byte: keeps the match SOURCE off
///                              position 0, which the greedy loop never
///                              inserts (like the upstream zstd it starts
///                              the cursor at ip+1 and hashes only visited
///                              positions)
///   bytes 1..6    `"ABCDE"`  — match source (position 1 IS visited)
///   bytes 6..29   `'!'` × 23 — filler that does NOT start with 'A'
///   bytes 29..34  `"ABCDE"`  — match site (repeats the source)
///   byte  34      `'F'`      — terminator: differs from byte 6 (`'!'`),
///                              so the forward extension at the match
///                              site stops at exactly length 5
///   bytes 35..55  `"GHIJKLMNOPQRSTUVWXYZ"` — trailing filler
///
/// A 5-byte match at offset 28 (= 29 - 1) must be emitted; a 6-byte+ match
/// at the same offset must NOT.
#[test]
fn dfast_accepts_exact_five_byte_match() {
    let mut data = Vec::new();
    data.push(b'Z'); // 0
    data.extend_from_slice(b"ABCDE"); // 1..6
    data.extend_from_slice(b"!!!!!!!!!!!!!!!!!!!!!!!"); // 6..29 (23 bytes)
    data.extend_from_slice(b"ABCDE"); // 29..34
    data.push(b'F'); // 34: forces forward extension to stop at length 5
    // Trailing filler so the match site (29) sits at least HASH_READ_SIZE (8)
    // bytes before the block end. The greedy double-fast — like the upstream zstd —
    // stops probing at `ilimit = iend - HASH_READ_SIZE`, so a match in the
    // final 8 bytes is never searched (upstream zstd parity, not a regression).
    data.extend_from_slice(b"GHIJKLMNOPQRSTUVWXYZ"); // 35..55
    assert_eq!(data.len(), 55);

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    matcher.add_data(data.clone(), |_| {});

    // Only matches at offset 28 are classified; any other sequence is ignored.
    let mut saw_five_byte_match = false;
    let mut saw_longer_match = false;
    matcher.start_matching(|seq| {
        if let Sequence::Triple {
            offset, match_len, ..
        } = seq
        {
            if offset == 28 && match_len == 5 {
                saw_five_byte_match = true;
            } else if offset == 28 && match_len > 5 {
                saw_longer_match = true;
            }
        }
    });

    assert!(
        saw_five_byte_match,
        "dfast must accept the exact-5-byte match — a 6-byte floor would skip it"
    );
    assert!(
        !saw_longer_match,
        "fixture pinned to length 5 — byte 34 ('F') must terminate the extension"
    );
}
7977
/// `reset(Default)` must select the Dfast backend with a 2 MiB window and
/// keep the skipped first slice as history for the second one;
/// `reset(Fastest)` must then shrink the window to 512 KiB.
#[test]
fn driver_switches_backends_and_initializes_dfast_via_reset() {
    const PAYLOAD: &[u8; 12] = b"abcabcabcabc";

    // Replays one sequence into `decoded`. Matches are copied byte by byte so
    // that an overlapping match re-reads bytes it has just appended.
    fn replay_into(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                let start = decoded.len() - offset;
                for src in start..start + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Default);
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Dfast);
    assert_eq!(driver.window_size(), 1u64 << 21);

    // First slice: committed, then skipped instead of matched.
    let mut skipped = driver.get_next_space();
    skipped[..PAYLOAD.len()].copy_from_slice(PAYLOAD);
    skipped.truncate(PAYLOAD.len());
    driver.commit_space(skipped);
    assert_eq!(driver.get_last_space(), PAYLOAD);
    driver.skip_matching_with_hint(None);

    // Second slice: same bytes, this time actually matched.
    let mut matched = driver.get_next_space();
    matched[..PAYLOAD.len()].copy_from_slice(PAYLOAD);
    matched.truncate(PAYLOAD.len());
    driver.commit_space(matched);

    // Seed the replay buffer with the skipped slice so that offsets reaching
    // back into it resolve.
    let mut reconstructed = PAYLOAD.to_vec();
    driver.start_matching(|seq| replay_into(&mut reconstructed, seq));
    assert_eq!(reconstructed, b"abcabcabcabcabcabcabcabc");

    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), 1u64 << 19);
}
8019
/// Level 5 must pick the Row backend and route it through the greedy parse.
#[test]
fn driver_level5_selects_row_backend() {
    use super::strategy::{BackendTag, ParseMode};

    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.reset(CompressionLevel::Level(5));

    assert_eq!(driver.active_backend(), BackendTag::Row);

    // `MatchGeneratorDriver::start_matching` sends the Row backend into
    // `start_matching_greedy` iff `self.parse == ParseMode::Greedy`, so the
    // selector itself is asserted — a round-trip check alone would also pass
    // on the lazy parser.
    assert_eq!(
        driver.parse,
        ParseMode::Greedy,
        "L5 must route to start_matching_greedy (parse == Greedy)",
    );

    // `row_matcher().lazy_depth` mirrors the parse mode and serves as
    // secondary corroboration. Checking `parse` above still catches a
    // regression if the two ever drift apart.
    assert_eq!(
        driver.row_matcher().lazy_depth,
        0,
        "row matcher lazy_depth must mirror the greedy parse mode",
    );
}
8043
/// Level 4 selects `StrategyTag::Dfast` — the greedy double-fast, upstream
/// zstd `ZSTD_dfast`. "Greedy" here names the parse discipline, not the
/// Row/Greedy strategy used at Level 5.
///
/// A successful round-trip does not pin match quality on its own (a lazy
/// parser would reconstruct the input just as well), so this test also guards
/// the parse output: a short repeating pattern has to yield at least one
/// `Sequence::Triple`. That catches a future regression that emits literals
/// only (e.g. a `min_match` or rep-probe guard regression).
#[test]
fn driver_level4_greedy_round_trip_single_slice() {
    // Replays one sequence into `decoded`. Matches are copied byte by byte so
    // that an overlapping match re-reads bytes it has just appended.
    fn replay_into(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                let start = decoded.len() - offset;
                for src in start..start + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    let input = b"abcdefgh_abcdefgh_abcdefgh_abcdefgh";

    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    let mut slice = driver.get_next_space();
    slice[..input.len()].copy_from_slice(input);
    slice.truncate(input.len());
    driver.commit_space(slice);

    let mut reconstructed: Vec<u8> = Vec::new();
    let mut saw_triple = false;
    driver.start_matching(|seq| {
        if matches!(seq, Sequence::Triple { .. }) {
            saw_triple = true;
        }
        replay_into(&mut reconstructed, seq);
    });

    assert_eq!(
        reconstructed,
        input.to_vec(),
        "L4 greedy parse failed to reconstruct repeating-pattern input",
    );
    assert!(
        saw_triple,
        "L4 greedy parse on a repeating pattern must emit at least one match (Triple)",
    );
}
8089
/// Checks that the L4 greedy parse carries repcode / hash-table state across
/// slice boundaries: the second slice repeats the first one byte for byte, so
/// the parse has to find matches that reach back into the previous slice's
/// history.
#[test]
fn driver_level4_greedy_round_trip_cross_slice() {
    // Replays one sequence into `decoded`. Matches are copied byte by byte so
    // that an overlapping match re-reads bytes it has just appended.
    fn replay_into(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                let start = decoded.len() - offset;
                for src in start..start + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    let chunk = b"the quick brown fox jumps over!!";
    assert_eq!(chunk.len(), 32);

    let mut driver = MatchGeneratorDriver::new(32, 4);
    driver.reset(CompressionLevel::Level(4));

    // Slice 1: must round-trip on its own.
    let mut first = driver.get_next_space();
    first[..chunk.len()].copy_from_slice(chunk);
    first.truncate(chunk.len());
    driver.commit_space(first);

    let mut first_recon: Vec<u8> = Vec::new();
    driver.start_matching(|seq| replay_into(&mut first_recon, seq));
    assert_eq!(
        first_recon,
        chunk.to_vec(),
        "first slice failed to round-trip"
    );

    // Slice 2: identical bytes, decoded on top of slice 1's reconstruction.
    let mut second = driver.get_next_space();
    second[..chunk.len()].copy_from_slice(chunk);
    second.truncate(chunk.len());
    driver.commit_space(second);

    let mut full = first_recon.clone();
    let mut saw_cross_slice_match = false;
    driver.start_matching(|seq| {
        // Every position in the second slice lies fewer than `chunk.len()`
        // bytes past the slice start, so an offset of at least `chunk.len()`
        // necessarily points into the first slice — the cross-slice behavior
        // under test.
        if let Sequence::Triple { offset, .. } = &seq {
            if *offset >= chunk.len() {
                saw_cross_slice_match = true;
            }
        }
        replay_into(&mut full, seq);
    });

    let mut expected = chunk.to_vec();
    expected.extend_from_slice(chunk);
    assert_eq!(
        full, expected,
        "cross-slice L4 greedy parse failed to reconstruct"
    );
    assert!(
        saw_cross_slice_match,
        "L4 greedy parse must match across slice boundaries (history is shared)",
    );
}
8167
/// Test-only helpers on [`MatchGeneratorDriver`] for routing a compression
/// level onto a non-default (search, parse) pairing via `config_override`.
#[cfg(test)]
impl MatchGeneratorDriver {
    /// Test-only: stage a parse×search recipe override applied on the
    /// next `reset()`. Routes a level through a non-default (parse,
    /// search) pair so the decoupling can be exercised end-to-end.
    ///
    /// This only stores the pair in `self.config_override`; nothing is
    /// reconfigured until the caller invokes `reset()`.
    pub(crate) fn set_config_override(
        &mut self,
        search: super::strategy::SearchMethod,
        parse: super::strategy::ParseMode,
    ) {
        self.config_override = Some((search, parse));
    }

    /// Test-only: reset `level` routed onto the lazy HashChain pairing.
    /// The lazy band runs on the Row backend in production, so HC-specific
    /// behaviour (live-chain dict prime, eviction budget accounting, seed
    /// pass gates) is exercised through this override-backed reset.
    ///
    /// Equivalent to staging a `(HashChain, Lazy2)` override and then
    /// calling `reset(level)`.
    pub(crate) fn reset_on_hc_lazy(&mut self, level: CompressionLevel) {
        self.set_config_override(
            super::strategy::SearchMethod::HashChain,
            super::strategy::ParseMode::Lazy2,
        );
        self.reset(level);
    }
}
8196
/// Runs a complete compress parse over `data` at `level` and rebuilds the
/// bytes from the sequences the driver emits.
///
/// When `over` is `Some((search, parse))` the pair is staged as a config
/// override before the driver is reset. Input is fed one driver space at a
/// time, and each space is parsed right after it is committed. A correct
/// parse yields a buffer equal to `data`.
#[cfg(test)]
fn drive_roundtrip_with_override(
    level: CompressionLevel,
    over: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    data: &[u8],
) -> Vec<u8> {
    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);
    if let Some((search, parse)) = over {
        driver.set_config_override(search, parse);
    }
    driver.reset(level);

    let mut decoded: Vec<u8> = Vec::with_capacity(data.len());
    // Bytes of `data` that have not been handed to the driver yet.
    let mut pending = data;
    while !pending.is_empty() {
        let mut space = driver.get_next_space();
        let take = pending.len().min(space.len());
        space[..take].copy_from_slice(&pending[..take]);
        space.truncate(take);
        driver.commit_space(space);
        pending = &pending[take..];

        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                // Byte-wise copy: the match source may overlap the bytes
                // that are being appended.
                let from = decoded.len() - offset;
                for src in from..from + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        });
    }
    decoded
}
8240
/// Phase 1 capability proof: parse mode and search backend are independent
/// axes, so a level can run any parse mode on any non-opt search backend.
/// Greedy-on-HashChain and Lazy2-on-RowHash could not be expressed by the
/// legacy `strategy_tag`; every pairing below must reconstruct the input
/// exactly.
#[test]
fn parse_search_matrix_decoupled_roundtrips() {
    use super::strategy::{ParseMode, SearchMethod};

    // Mixed repetitive + literal payload that exercises matches and reps:
    // a fixed phrase followed by a changing little-endian counter.
    const PHRASE: &[u8] = b"the quick brown fox ";
    let mut data = Vec::new();
    for counter in 0..4000u32 {
        data.extend_from_slice(PHRASE);
        data.extend_from_slice(&counter.to_le_bytes());
    }

    // (level, search backend, parse mode, failure message)
    let pairings = [
        // Greedy parse on HashChain (legacy: Greedy was welded to RowHash).
        (
            5,
            SearchMethod::HashChain,
            ParseMode::Greedy,
            "greedy-on-hashchain diverged",
        ),
        // Lazy2 parse on RowHash (legacy: Lazy was welded to HashChain).
        (
            8,
            SearchMethod::RowHash,
            ParseMode::Lazy2,
            "lazy2-on-rowhash diverged",
        ),
        // Lazy on RowHash too (depth 1).
        (
            6,
            SearchMethod::RowHash,
            ParseMode::Lazy,
            "lazy-on-rowhash diverged",
        ),
    ];

    for (level, search, parse, failure) in pairings {
        let got = drive_roundtrip_with_override(
            CompressionLevel::Level(level),
            Some((search, parse)),
            &data,
        );
        assert_eq!(got, data, "{failure}");
    }
}
8281
/// The row `mls` knob (C-like `minMatch`) is honoured: for every value in
/// 4..=7 the stream round-trips, and no emitted match (regular row or
/// repcode, on the lazy parse) is shorter than `mls`. The default (5)
/// reproduces the historical `ROW_MIN_MATCH_LEN` behaviour.
#[test]
fn row_mls_knob_gates_matches_and_roundtrips() {
    // An 8-byte constant prefix followed by a 4-byte counter, 4000 times.
    let mut data: Vec<u8> = Vec::new();
    for i in 0..4000u32 {
        data.extend_from_slice(b"abcdefgh");
        data.extend_from_slice(&i.to_le_bytes());
    }

    for mls in 4usize..=7 {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        let mut cfg = ROW_CONFIG;
        cfg.mls = mls;
        matcher.configure(cfg);
        matcher.add_data(data.clone(), |_| {});

        let mut out: Vec<u8> = Vec::with_capacity(data.len());
        // Shortest match length seen so far; `None` until a match appears.
        let mut shortest: Option<usize> = None;
        matcher.start_matching(|seq| match seq {
            Sequence::Literals { literals } => out.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                out.extend_from_slice(literals);
                shortest = match shortest {
                    Some(prev) if prev <= match_len => Some(prev),
                    _ => Some(match_len),
                };
                let from = out.len() - offset;
                for src in from..from + match_len {
                    let byte = out[src];
                    out.push(byte);
                }
            }
        });

        assert_eq!(out, data, "mls={mls} round-trip diverged");
        // The floor is only checked when at least one match was emitted.
        if let Some(shortest_match) = shortest {
            assert!(
                shortest_match >= mls,
                "mls={mls}: emitted a {shortest_match}-byte match below the floor",
            );
        }
    }
}
8331
/// `LevelParams::parse()` derives the parse mode from the `search` axis
/// rather than from the strategy tag, so the decoupling holds even when a
/// `Bt*`-tagged level is overridden to a non-BT search backend. Before the
/// fix the method matched on `strategy_tag` and returned `Optimal` for any
/// `Bt*` tag regardless of `search` / `lazy_depth`.
#[test]
fn parse_mode_follows_search_axis_not_strategy_tag() {
    use super::strategy::{ParseMode, SearchMethod};

    // LEVEL_TABLE[15] is level 16: BtOpt tag, BinaryTree search.
    let mut params = LEVEL_TABLE[15];
    assert_eq!(
        params.parse(),
        ParseMode::Optimal,
        "BinaryTree search → Optimal"
    );

    // Swap the Bt-tagged level onto a non-BT backend: from here on the
    // parse mode must be derived from `lazy_depth` instead of staying
    // `Optimal`.
    params.search = SearchMethod::RowHash;
    let depth_cases = [
        (0, ParseMode::Greedy, "RowHash + depth 0 → Greedy"),
        (2, ParseMode::Lazy2, "RowHash + depth 2 → Lazy2"),
    ];
    for (depth, expected, label) in depth_cases {
        params.lazy_depth = depth;
        assert_eq!(params.parse(), expected, "{label}");
    }
}
8351
/// The test-only `config_override` is one-shot: the first `reset()` takes
/// it, so a reused driver does not silently keep the synthetic pairing
/// armed for later resets. Before the fix `reset()` copied the override
/// and left it set.
#[test]
fn config_override_is_consumed_by_reset() {
    use super::strategy::{ParseMode, SearchMethod};

    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);
    driver.set_config_override(SearchMethod::RowHash, ParseMode::Lazy2);
    let armed_before_reset = driver.config_override.is_some();
    assert!(armed_before_reset);

    driver.reset(CompressionLevel::Level(5));
    let armed_after_reset = driver.config_override.is_some();
    assert!(
        !armed_after_reset,
        "override must be consumed after one reset",
    );
}
8368
/// Helper: round-trip `data` through the Level 4 parse and assert the
/// reconstructed bytes match the input.
///
/// The payload is fed to a driver built with `slice_size` / `max_slices`,
/// one slice at a time, and each slice is parsed immediately after it is
/// committed. Returns `(triple_count, max_offset)` so callers can probe the
/// parse shape: the number of `Sequence::Triple`s emitted and the largest
/// match offset seen (0 when no match was emitted).
///
/// Panics if the reconstructed bytes differ from `data`, or if an empty
/// payload produces a match.
//
// Level 4 maps to the greedy Dfast (double-fast) backend — "greedy" here is the
// parse discipline (no lazy lookahead, upstream zstd `ZSTD_dfast`), NOT the Row/Greedy
// strategy (which is Level 5). This roundtrip is intentional Dfast L4 coverage;
// the Row backend is exercised by the `Level(5)` fixtures elsewhere in this file.
#[cfg(test)]
fn l4_greedy_round_trip(slice_size: usize, max_slices: usize, data: &[u8]) -> (usize, usize) {
    let mut driver = MatchGeneratorDriver::new(slice_size, max_slices);
    driver.reset(CompressionLevel::Level(4));

    let mut reconstructed: Vec<u8> = Vec::with_capacity(data.len());
    let mut triple_count = 0usize;
    let mut max_offset = 0usize;

    // `start_matching` consumes the current pending slice; multi-slice
    // payloads require commit + drive per slice so earlier slices'
    // bytes actually round-trip out before they're displaced from the
    // window.
    let mut offset_in_data = 0usize;
    while offset_in_data < data.len() {
        let mut space = driver.get_next_space();
        let take = (data.len() - offset_in_data).min(space.len());
        space[..take].copy_from_slice(&data[offset_in_data..offset_in_data + take]);
        space.truncate(take);
        driver.commit_space(space);
        offset_in_data += take;

        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                triple_count += 1;
                max_offset = max_offset.max(offset);
                reconstructed.extend_from_slice(literals);
                // Byte-wise copy: the match source may overlap the bytes
                // being appended (offset < match_len).
                let start = reconstructed.len() - offset;
                for i in 0..match_len {
                    let byte = reconstructed[start + i];
                    reconstructed.push(byte);
                }
            }
        });
    }

    // Empty payload still needs one commit/drive round so the empty-
    // input path of `start_matching_greedy` (the `current_len == 0`
    // early-return guard) gets exercised.
    if data.is_empty() {
        let mut space = driver.get_next_space();
        space.truncate(0);
        driver.commit_space(space);
        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
            Sequence::Triple { .. } => panic!("empty input must not emit any matches"),
        });
    }

    assert_eq!(reconstructed, data, "L4 greedy round-trip diverged");
    (triple_count, max_offset)
}
8433
/// Tail rep-only case (flagged by CodeRabbit): the former outer-loop guard
/// `pos + ROW_MIN_MATCH_LEN <= current_len` (6) left the last 5-byte
/// position unreachable. The rep probe at `abs_pos + 1` needs only 4 bytes
/// of lookahead past the probe point, so the guard was relaxed to
/// `pos + GREEDY_MIN_LOOKAHEAD <= current_len` (5).
///
/// The two slices are driven separately and the assertion counts matches
/// emitted **by the second slice's parse pass** only. A regression that
/// re-tightens the guard, or that breaks the cross-slice repcode lookup,
/// therefore fails here instead of hiding behind first-slice matches.
#[test]
fn driver_level5_greedy_tail_rep_only_reachable() {
    // Copies `bytes` into the driver's next space and commits it.
    fn stage(driver: &mut MatchGeneratorDriver, bytes: &[u8]) {
        let mut space = driver.get_next_space();
        space[..bytes.len()].copy_from_slice(bytes);
        space.truncate(bytes.len());
        driver.commit_space(space);
    }

    // The period-4 first slice locks rep1 = 4 into `offset_hist` by the
    // time the parse reaches the slice tail. The second slice is exactly
    // 5 bytes (= `GREEDY_MIN_LOOKAHEAD`), so the outer loop runs **once**
    // at `pos = 0`. The regular `row_candidate` requires 6 bytes from
    // `abs_pos`, which is past the live history, leaving the
    // `abs_pos + 1` rep probe as the only viable hit. The probe sees a
    // 4-byte match at offset 4 (`second[1..5] == first[13..16] ++
    // second[0] == "BCDA"`), and `extend_backwards_shared` then absorbs
    // `second[0]` into the match (one byte back into the implicit anchor,
    // no further because the anchor itself is the current `abs_pos`).
    let first: &[u8] = b"ABCDABCDABCDABCD"; // 16 bytes — strict period 4
    let second: &[u8] = b"ABCDA"; // 5 bytes — exact GREEDY_MIN_LOOKAHEAD
    let mut driver = MatchGeneratorDriver::new(16, 2);
    driver.reset(CompressionLevel::Level(5));

    // First slice: parsed only to build up history, output ignored.
    stage(&mut driver, first);
    driver.start_matching(|_| {});

    // Second slice: count just the matches this pass emits.
    stage(&mut driver, second);
    let mut second_slice_triples = 0usize;
    driver.start_matching(|seq| match seq {
        Sequence::Triple { .. } => second_slice_triples += 1,
        Sequence::Literals { .. } => {}
    });

    assert!(
        second_slice_triples >= 1,
        "tail rep-only position must produce a match in the second slice \
         (got {second_slice_triples} triples)",
    );
}
8487
#[test]
fn driver_level4_greedy_empty_input_emits_nothing() {
    // A zero-length slice must yield no sequences and must not panic.
    // Exercises the `current_len == 0` early-return guard at the top of
    // `start_matching_greedy`.
    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    // An empty space is still committed so the matcher has SOMETHING to
    // start matching on; without it `start_matching` panics on the
    // `window.back()` unwrap, which is a separate path covered by the
    // existing reset tests.
    let mut space = driver.get_next_space();
    space.truncate(0);
    driver.commit_space(space);

    let mut emitted_count = 0usize;
    driver.start_matching(|_| emitted_count += 1);
    assert!(
        emitted_count == 0,
        "empty slice must not emit any sequences",
    );
}
8506
#[test]
fn driver_level4_greedy_sub_min_lookahead_input() {
    // 4 bytes is below `GREEDY_MIN_LOOKAHEAD = 5`, so the outer loop body
    // never runs; the tail-literal path still has to emit the input as a
    // single `Sequence::Literals` (the helper asserts the round-trip).
    let (triples, _max_offset) = l4_greedy_round_trip(64, 2, b"abcd");
    assert_eq!(
        triples, 0,
        "sub-min-lookahead input must not emit any matches (got {triples})",
    );
}
8519
#[test]
fn driver_level4_greedy_incompressible_input() {
    // 256 pseudo-random bytes with no exploitable structure, so every
    // position is a "miss" for both the rep probe and the row candidate.
    // This drives the miss branch plus the `SKIP_STRENGTH = 10` skip-step
    // grow (irrelevant at this size, but the path runs).
    const LEN: usize = 256;
    let mut data: Vec<u8> = Vec::with_capacity(LEN);
    let mut state: u32 = 0xDEAD_BEEF;
    while data.len() < LEN {
        state = state.wrapping_mul(1_103_515_245).wrapping_add(12345);
        data.push((state >> 16) as u8);
    }

    // No structural assertion: the helper already requires a bit-exact
    // round-trip, and the test passes as long as no panic /
    // `debug_assert!` fires.
    let _ = l4_greedy_round_trip(64, 8, &data);
}
8536
#[test]
fn driver_level4_greedy_long_literal_run_skip_step_growth() {
    // Stress smoke test. 2 KiB of unstructured bytes pushes the literal
    // run past the `SKIP_STRENGTH = 10` threshold (~1 KiB), so the miss
    // branch and the per-miss step-grow path in `start_matching_greedy`
    // both execute.
    //
    // What is checked: a bit-exact round-trip (inside the helper) and the
    // absence of panics / `debug_assert!` failures on a long
    // incompressible run.
    //
    // What is deliberately NOT checked: the `SKIP_STRENGTH` value or the
    // per-iteration step count. The round-trip would pass just as well
    // with `SKIP_STRENGTH = 6` or `= 14`, since both produce valid
    // sequences. Pinning the exact step growth would require returning
    // step / iteration metadata from the parse, which is invasive
    // plumbing for a constant that hasn't been re-tuned in months.
    const LEN: usize = 2048;
    let mut data: Vec<u8> = Vec::with_capacity(LEN);
    let mut state: u32 = 0xC0FF_EE00;
    while data.len() < LEN {
        state = state.wrapping_mul(0x9E37_79B9).wrapping_add(0xCAFEBABE);
        data.push((state >> 24) as u8);
    }
    let _ = l4_greedy_round_trip(512, 8, &data);
}
8561
#[test]
fn driver_level4_greedy_all_zeros_heavy_rep1() {
    // In an all-zero buffer every byte after the first equals its
    // predecessor (`byte[pos] == byte[pos - 1]`), so the rep1 probe at
    // `abs_pos + 1` hits immediately and the parse collapses to a single
    // long match. Exercises the `cheap rep at +1, full-match length` path.
    let zeros = [0u8; 128];
    let (triples, max_offset) = l4_greedy_round_trip(64, 8, &zeros);

    assert!(
        triples >= 1,
        "all-zeros input must produce at least one rep1 match",
    );
    // Every match should sit at rep1 (offset 1); a larger offset would
    // indicate the rep1 probe was bypassed.
    assert_eq!(
        max_offset, 1,
        "all-zeros L4 greedy parse should commit at offset 1 (got {max_offset})",
    );
}
8582
/// A periodic payload covers the steady-state rep-cascade path of the
/// greedy parse: once the period is locked into `offset_hist[0]`, the
/// main-loop rep probe at `abs_pos + 1` fires on every iteration and the
/// parse emits a long chain of triples at the same offset.
#[test]
fn driver_level4_greedy_periodic_pattern_rep_cascade() {
    // A 16-byte unit repeated 32 times.
    let unit: &[u8] = b"alpha_beta_gamma";
    assert_eq!(unit.len(), 16);
    let data: Vec<u8> = unit.repeat(32);

    let (triples, max_offset) = l4_greedy_round_trip(64, 16, &data);
    assert!(
        triples >= 1,
        "periodic 16-byte payload must emit matches (got {triples})",
    );
    assert!(
        max_offset >= 16,
        "periodic 16-byte payload must produce at least one offset >= 16 \
         (got max_offset = {max_offset})",
    );
}
8607
#[test]
fn driver_reset_keeps_strategy_tag_in_sync_with_active_backend() {
    use super::strategy::StrategyTag;

    // (level to reset to, strategy tag the driver must report afterwards)
    let cases = [
        (CompressionLevel::Level(1), StrategyTag::Fast),
        (CompressionLevel::Level(2), StrategyTag::Fast),
        (CompressionLevel::Level(3), StrategyTag::Dfast),
        (CompressionLevel::Level(4), StrategyTag::Dfast),
        (CompressionLevel::Level(5), StrategyTag::Greedy),
        (CompressionLevel::Level(7), StrategyTag::Lazy),
        (CompressionLevel::Level(12), StrategyTag::Lazy),
        (CompressionLevel::Level(13), StrategyTag::Btlazy2),
        (CompressionLevel::Level(14), StrategyTag::Btlazy2),
        (CompressionLevel::Level(15), StrategyTag::Btlazy2),
        (CompressionLevel::Level(16), StrategyTag::BtOpt),
        (CompressionLevel::Level(18), StrategyTag::BtUltra),
        (CompressionLevel::Level(22), StrategyTag::BtUltra2),
        (CompressionLevel::Fastest, StrategyTag::Fast),
        (CompressionLevel::Default, StrategyTag::Dfast),
        (CompressionLevel::Better, StrategyTag::Lazy),
        // `Best` sits on level 13 (the first dominant point of the deep band).
        (CompressionLevel::Best, StrategyTag::Btlazy2),
    ];

    for (level, expected) in cases {
        // Fresh driver per case so no state leaks between levels.
        let mut driver = MatchGeneratorDriver::new(32, 2);
        driver.reset(level);
        assert_eq!(
            driver.strategy_tag, expected,
            "strategy_tag wrong for {level:?}"
        );
        // The tag's backend and the backend actually selected must agree.
        assert_eq!(
            driver.strategy_tag.backend(),
            driver.active_backend(),
            "strategy_tag backend disagrees with active_backend for {level:?}"
        );
    }
}
8645
#[test]
fn level_16_17_map_to_btopt_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    // Levels 16 and 17 both resolve onto the HashChain backend tag ...
    let resolved = [16, 17].map(|level| resolve_level_params(CompressionLevel::Level(level), None));
    for params in resolved {
        assert_eq!(params.backend(), BackendTag::HashChain);
    }
    // ... and both carry the BtOpt strategy tag.
    for level in [16, 17] {
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtOpt);
    }
}
8656
#[test]
fn level_18_maps_to_btultra_level_19_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    // Upstream zstd `clevels.h` (srcSize > 256 KiB tier) has level 18 =
    // `ZSTD_btultra` and level 19 = `ZSTD_btultra2`. Level 19 used to be
    // mapped to plain btultra, which under-searched (searchLog 6 vs 7)
    // and lost ~3.7% ratio on the repo corpus.
    let resolved = [18, 19].map(|level| resolve_level_params(CompressionLevel::Level(level), None));
    for params in resolved {
        assert_eq!(params.backend(), BackendTag::HashChain);
    }

    let expected_tags = [(18, StrategyTag::BtUltra), (19, StrategyTag::BtUltra2)];
    for (level, tag) in expected_tags {
        assert_eq!(StrategyTag::for_level(level), tag);
    }
}
8671
#[test]
fn level_20_22_map_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    // Levels 20, 21 and 22 all resolve to the HashChain backend tag and
    // the BtUltra2 strategy tag.
    for level in 20u8..=22 {
        let params = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(params.backend(), BackendTag::HashChain);
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtUltra2);
    }
}
8681
#[test]
fn level22_uses_target_length_and_large_input_tables() {
    // Level 22 without a source-size hint: pin the window log and the
    // hash-chain geometry it resolves to.
    let params = resolve_level_params(CompressionLevel::Level(22), None);
    assert_eq!(params.window_log, 27);

    let hc = params.hc.unwrap();
    // (hash_log, chain_log, search_depth, target_len)
    assert_eq!(
        (hc.hash_log, hc.chain_log, hc.search_depth, hc.target_len),
        (25, 27, 1 << 9, 999),
    );
}
8692
#[test]
fn bt_levels_16_to_21_pin_clevels_params() {
    // Pins the BT-level geometry so the clevels.h alignment cannot drift
    // silently. Levels 16-20 mirror upstream `clevels.h` (srcSize >
    // 256 KiB tier, search_depth = 1 << searchLog). Level 21 intentionally
    // keeps a deeper search_depth (512 vs upstream's 128) — it beats C on
    // ratio there and the deeper walk is a deliberate ratio-positive
    // divergence.
    //
    // Columns: (level, window_log, hash_log, chain_log, search_depth, target_len)
    const PINNED: [(u8, u8, usize, usize, usize, usize); 6] = [
        (16, 22, 22, 22, 32, 48),
        (17, 23, 22, 23, 32, 64),
        (18, 23, 22, 23, 64, 64),
        (19, 23, 22, 24, 128, 256),
        (20, 25, 23, 25, 128, 256),
        (21, 26, 24, 24, 512, 256),
    ];

    for &(level, window_log, hash_log, chain_log, search_depth, target_len) in PINNED.iter() {
        let params = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(params.window_log, window_log, "level {level} window_log");
        let hc = params.hc.unwrap();
        assert_eq!(hc.hash_log, hash_log, "level {level} hash_log");
        assert_eq!(hc.chain_log, chain_log, "level {level} chain_log");
        assert_eq!(hc.search_depth, search_depth, "level {level} search_depth");
        assert_eq!(hc.target_len, target_len, "level {level} target_len");
    }
}
8720
#[test]
fn level22_source_size_hint_uses_btultra2_tiers() {
    // Level 22 resolved with three source-size hints. Each row pins the
    // geometry selected for that hint; `target_len` is 999 in every tier.
    //
    // Columns: (source size, window_log, hash_log, chain_log, search_depth)
    let tiers: [(u64, u8, usize, usize, usize); 3] = [
        (16 * 1024, 14, 15, 15, 1 << 10),
        (128 * 1024, 17, 17, 18, 1 << 11),
        (256 * 1024, 18, 19, 19, 1 << 13),
    ];

    for (source_size, window_log, hash_log, chain_log, search_depth) in tiers {
        let params = resolve_level_params(CompressionLevel::Level(22), Some(source_size));
        assert_eq!(params.window_log, window_log);
        let hc = params.hc.unwrap();
        assert_eq!(hc.hash_log, hash_log);
        assert_eq!(hc.chain_log, chain_log);
        assert_eq!(hc.search_depth, search_depth);
        assert_eq!(hc.target_len, 999);
    }
}
8747
#[test]
fn level22_non_power_of_two_small_source_uses_tier3_params() {
    // A 15 027-byte source (<= 16 KB) selects the table[3] btultra2 row,
    // and the source-size clamp yields windowLog 14 (ceil log2 15027).
    // This is a pure-Rust check against the constant tier-3 geometry; no
    // FFI is involved.
    const SOURCE_SIZE: u64 = 15_027;
    let resolved = resolve_level_params(CompressionLevel::Level(22), Some(SOURCE_SIZE));

    let chain_cfg = resolved.hc.unwrap();
    assert_eq!(resolved.window_log, 14);
    assert_eq!(chain_cfg.chain_log, 15);
    assert_eq!(chain_cfg.hash_log, 15);
    assert_eq!(chain_cfg.search_depth, 1 << 10);
    assert_eq!(HC_OPT_MIN_MATCH_LEN, 3);
    assert_eq!(chain_cfg.target_len, 999);
}
8764
#[test]
fn level22_small_source_uses_window_bounded_hash3_log() {
    use super::strategy::StrategyTag;

    let mut hc = HcMatchGenerator::new(1 << 14);

    // Configured with the 16K level-22 config and window log 14, the
    // table's `hash3_log` must come out as 14.
    hc.configure(BTULTRA2_HC_CONFIG_L22_16K, StrategyTag::BtUltra2, 14);
    assert_eq!(hc.table.hash3_log, 14);

    // Reconfigured with the full level-22 config and window log 27,
    // `hash3_log` must equal `HC3_HASH_LOG`.
    hc.configure(BTULTRA2_HC_CONFIG_L22, StrategyTag::BtUltra2, 27);
    assert_eq!(hc.table.hash3_log, HC3_HASH_LOG);
}
8782
#[test]
fn btultra2_seed_pass_initializes_opt_state() {
    use super::strategy::StrategyTag;

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // 32 KiB payload: byte `i` is `i % 251`.
    let mut data: Vec<u8> = Vec::with_capacity(32 * 1024);
    for i in 0..32 * 1024 {
        data.push((i % 251) as u8);
    }
    hc.table.add_data(data, |_| {});
    hc.start_matching(|_| {});

    // After the first block both frequency sums must be non-zero.
    let lit_length_sum = hc.backend.bt_mut().opt_state.lit_length_sum;
    assert!(
        lit_length_sum > 0,
        "btultra2 first block should seed non-zero sequence statistics"
    );
    let off_code_sum = hc.backend.bt_mut().opt_state.off_code_sum;
    assert!(
        off_code_sum > 0,
        "btultra2 first block should seed offset-code statistics"
    );
}
8803
#[test]
fn btultra2_profile_disables_small_offset_handicap() {
    use super::strategy::BtUltra2;

    // Before Phase 3 this test built the profile twice (`pass2=false` and
    // `pass2=true`) because `for_mode` told them apart. With
    // `const_for_strategy::<BtUltra2>()` only one profile exists — the
    // upstream zstd `opt2` pricing — so one binding covers the invariant.
    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();

    let favors_small_offsets = profile.favor_small_offsets;
    assert!(
        !favors_small_offsets,
        "btultra2 should match upstream zstd opt2 offset pricing"
    );
    let accurate_pricing = profile.accurate;
    assert!(
        accurate_pricing,
        "btultra2 should use upstream zstd opt2 accurate pricing"
    );
}
8821
#[test]
fn btultra_profile_keeps_search_depth_budget() {
    use super::strategy::BtUltra;

    // The BtUltra cost profile must carry a chain-depth budget of 64.
    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra>();
    let budget = profile.max_chain_depth;
    assert_eq!(
        budget, 64,
        "btultra chain-depth budget must match clevels.h level 18 searchLog 6 (1 << 6 = 64)"
    );
}
8830
#[test]
fn btopt_profile_keeps_search_depth_budget() {
    use super::strategy::BtOpt;

    // The BtOpt cost profile must carry a chain-depth budget of 32.
    let profile = HcOptimalCostProfile::const_for_strategy::<BtOpt>();
    let budget = profile.max_chain_depth;
    assert_eq!(
        budget, 32,
        "btopt should not cap chain depth below upstream zstd btopt search budget"
    );
}
8839
#[test]
fn sufficient_match_len_is_clamped_by_target_len() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // A small `target_len` set after configuration must be what the
    // BtUltra2 pass reports as its sufficient match length.
    hc.hc.target_len = 13;
    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient = hc.hc.sufficient_match_len_for_pass(profile);
    assert_eq!(sufficient, 13);
}
8852
#[test]
fn opt_modes_use_target_len_as_sufficient_len() {
    use super::strategy::{BtOpt, BtUltra, BtUltra2};

    // Every optimal-parser profile must report the same sufficient length
    // as the target length stored on the matcher.
    let expected_len = 57;
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = expected_len;

    for opt_profile in [
        HcOptimalCostProfile::const_for_strategy::<BtOpt>(),
        HcOptimalCostProfile::const_for_strategy::<BtUltra>(),
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    ] {
        let sufficient_len = matcher.hc.sufficient_match_len_for_pass(opt_profile);
        assert_eq!(sufficient_len, expected_len);
    }
}
8867
#[test]
fn sufficient_match_len_is_capped_by_opt_num() {
    use super::strategy::BtUltra2;

    // An absurdly large target length must be capped at `HC_OPT_NUM - 1`.
    let oversized_target = usize::MAX / 2;
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = oversized_target;

    let btultra2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient_len = matcher.hc.sufficient_match_len_for_pass(btultra2);
    assert_eq!(sufficient_len, HC_OPT_NUM - 1);
}
8875
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_entropy_seed_initializes_opt_state_from_tables() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // Seed with a Huffman table built from sample bytes plus the default
    // LL / ML / OF FSE tables.
    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(
        b"aaabbbbccccddddeeeeefffffgggg",
    );
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let btultra2 = HcOptimalCostProfile::const_for_strategy::<strategy::BtUltra2>();
    matcher
        .backend
        .bt_mut()
        .opt_state
        .rescale_freqs(b"abcd", btultra2);

    // Reference literal-length frequencies the test treats as the unseeded
    // fallback: 4, 2, then all ones.
    let mut fallback_ll_freqs = [1u32; HC_MAX_LL + 1];
    fallback_ll_freqs[0] = 4;
    fallback_ll_freqs[1] = 2;

    let bt = matcher.backend.bt_mut();
    assert_ne!(
        bt.opt_state.lit_length_freq,
        fallback_ll_freqs,
        "dictionary entropy should override fallback LL bootstrap frequencies"
    );
    let ml_is_non_uniform = bt
        .opt_state
        .match_length_freq
        .iter()
        .any(|&freq| freq != 1);
    assert!(
        ml_is_non_uniform,
        "dictionary entropy should seed non-uniform ML frequencies"
    );
    assert_ne!(
        bt.opt_state.off_code_freq[0],
        6,
        "dictionary entropy should override fallback OF bootstrap frequencies"
    );
}
8924
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_fse_seed_applies_without_huffman_seed() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // Only the FSE tables are supplied; the Huffman slot is left empty.
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let btultra2 = HcOptimalCostProfile::const_for_strategy::<strategy::BtUltra2>();
    matcher
        .backend
        .bt_mut()
        .opt_state
        .rescale_freqs(b"abcd", btultra2);

    // Reference literal-length frequencies the test treats as the unseeded
    // fallback: 4, 2, then all ones.
    let mut fallback_ll_freqs = [1u32; HC_MAX_LL + 1];
    fallback_ll_freqs[0] = 4;
    fallback_ll_freqs[1] = 2;

    let bt = matcher.backend.bt_mut();
    assert_ne!(
        bt.opt_state.lit_length_freq,
        fallback_ll_freqs,
        "FSE seed should still override LL bootstrap frequencies without huffman seed"
    );
    let ml_is_non_uniform = bt
        .opt_state
        .match_length_freq
        .iter()
        .any(|&freq| freq != 1);
    assert!(
        ml_is_non_uniform,
        "FSE seed should still seed non-uniform ML frequencies"
    );
    assert_ne!(
        bt.opt_state.off_code_freq[0],
        6,
        "FSE seed should still override OF bootstrap frequencies without huffman seed"
    );
}
8968
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_seed_overrides_predef_price_mode_on_tiny_input() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // FSE-only dictionary seed, followed by a rescale over a 3-byte source.
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let btultra2 = HcOptimalCostProfile::const_for_strategy::<strategy::BtUltra2>();
    matcher
        .backend
        .bt_mut()
        .opt_state
        .rescale_freqs(b"abc", btultra2);

    let stays_dynamic = matches!(
        matcher.backend.bt_mut().opt_state.price_type,
        HcOptPriceType::Dynamic
    );
    assert!(
        stays_dynamic,
        "dictionary-seeded first block should stay in dynamic mode even for tiny src"
    );
}
8995
#[test]
fn lit_length_price_blocksize_max_costs_one_extra_bit() {
    use super::strategy::BtUltra2;

    let below_max = HC_BLOCKSIZE_MAX.saturating_sub(1);

    // Predefined price mode: a fresh state with only the price type switched.
    {
        let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
        let mut stats = HcOptState::new();
        stats.price_type = HcOptPriceType::Predefined;

        let price_at_max = profile.lit_length_price(&stats, HC_BLOCKSIZE_MAX);
        let price_below_max = profile.lit_length_price(&stats, below_max);
        assert_eq!(
            price_at_max,
            price_below_max + HC_BITCOST_MULTIPLIER,
            "predefined litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
        );
    }

    // Dynamic price mode: every frequency table is flat (all ones) with a
    // matching sum, and base prices are refreshed before pricing.
    {
        let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
        let mut stats = HcOptState::new();
        stats.price_type = HcOptPriceType::Dynamic;
        stats.lit_length_freq.fill(1);
        stats.lit_length_sum = (HC_MAX_LL + 1) as u32;
        stats.match_length_freq.fill(1);
        stats.match_length_sum = (HC_MAX_ML + 1) as u32;
        stats.off_code_freq.fill(1);
        stats.off_code_sum = (HC_MAX_OFF + 1) as u32;
        stats.lit_freq.fill(1);
        stats.lit_sum = (HC_MAX_LIT + 1) as u32;
        stats.set_base_prices(true);

        let price_at_max = profile.lit_length_price(&stats, HC_BLOCKSIZE_MAX);
        let price_below_max = profile.lit_length_price(&stats, below_max);
        assert_eq!(
            price_at_max,
            price_below_max + HC_BITCOST_MULTIPLIER,
            "dynamic litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
        );
    }
}
9030
#[test]
#[allow(clippy::borrow_deref_ref)]
fn btultra2_seed_pass_disabled_when_dictionary_entropy_seed_present() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // Seed the matcher with the default FSE tables (no Huffman table).
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let block_len = HC_PREDEF_THRESHOLD + 1;
    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<strategy::BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "dictionary-seeded first block should skip btultra2 warmup pass"
    );
}
9049
#[test]
fn btultra2_seed_pass_disabled_when_prefix_history_exists() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // Fake earlier data: a non-zero absolute history start plus one chunk
    // already pushed into the table.
    matcher.table.history_abs_start = 17;
    matcher.table.push_test_chunk(b"abcdefghijklmnop".to_vec());

    let block_len = HC_PREDEF_THRESHOLD + 9;
    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<strategy::BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must be first-block only (no prefix history)"
    );
}
9065
#[test]
fn btultra2_seed_pass_disabled_for_tiny_block() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // A block of exactly `HC_PREDEF_THRESHOLD` bytes is the boundary case.
    let block_len = HC_PREDEF_THRESHOLD;
    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<strategy::BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup should not run at or below predefined threshold"
    );
}
9079
#[test]
fn btultra2_seed_pass_disabled_after_stats_initialized() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // A non-zero literal-length sum marks the statistics as already seeded.
    matcher.backend.bt_mut().opt_state.lit_length_sum = 1;

    let block_len = HC_PREDEF_THRESHOLD + 32;
    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<strategy::BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup should run only for first block before stats are initialized"
    );
}
9094
#[test]
fn btultra2_seed_pass_disabled_when_not_at_frame_start() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // Fake a non-first block: the deque holds only the current block, yet the
    // produced window is already larger than that block, i.e. earlier output
    // exists. Only lengths are recorded because the seed-pass check reads
    // lengths, not bytes.
    let block_len = HC_PREDEF_THRESHOLD + 32;
    matcher.table.window_size = HC_PREDEF_THRESHOLD + 64;
    matcher.table.chunk_lens.push_back(block_len);

    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<strategy::BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must not run after frame start"
    );
}
9114
#[test]
fn btultra2_seed_pass_disabled_when_ldm_sequences_exist() {
    use super::strategy;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, strategy::StrategyTag::BtUltra2, 26);

    // Window and the single live chunk have the same length here.
    let window_len = HC_PREDEF_THRESHOLD + 64;
    matcher.table.window_size = window_len;
    matcher.table.chunk_lens.push_back(window_len);

    // One pre-existing LDM sequence is enough to veto the warmup pass.
    let ldm_seq = HcRawSeq {
        lit_length: 8,
        offset: 16,
        match_length: 32,
    };
    matcher.backend.bt_mut().ldm_sequences.push(ldm_seq);

    let block_len = HC_PREDEF_THRESHOLD + 32;
    let runs_seed_pass = matcher.should_run_btultra2_seed_pass::<strategy::BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must not run when LDM already produced sequences"
    );
}
9135
#[test]
fn literal_price_uses_eight_bits_when_literals_uncompressed() {
    use super::strategy::BtUltra2;

    // Literal compression is switched off, then the price type is set to
    // `Predefined`; the literal must still price at a flat 8 bits.
    let mut opt_state = HcOptState::new();
    opt_state.set_literals_compressed_for_tests(false);
    opt_state.price_type = HcOptPriceType::Predefined;

    let btultra2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let price = btultra2.literal_price(&opt_state, b'a');
    assert_eq!(
        price,
        8 * HC_BITCOST_MULTIPLIER,
        "uncompressed literals should cost 8 bits regardless of price mode"
    );
}
9148
#[test]
fn update_stats_skips_literal_frequencies_when_uncompressed() {
    // With literal compression off, one `update_stats` call must leave the
    // literal tables untouched while still counting the sequence codes.
    let mut opt_state = HcOptState::new();
    opt_state.set_literals_compressed_for_tests(false);
    opt_state.update_stats(3, b"abc", 4, 8);

    let literal_freq_total = opt_state.lit_freq.iter().sum::<u32>();
    assert_eq!(
        opt_state.lit_sum, 0,
        "literal sum must remain unchanged when literal compression is disabled"
    );
    assert_eq!(
        literal_freq_total,
        0,
        "literal frequencies must not be updated when literal compression is disabled"
    );

    assert_eq!(
        opt_state.lit_length_sum, 1,
        "literal-length stats still update for sequence modeling"
    );
    assert_eq!(
        opt_state.match_length_sum, 1,
        "match-length stats still update for sequence modeling"
    );
    assert_eq!(
        opt_state.off_code_sum, 1,
        "offset-code stats still update for sequence modeling"
    );
}
9176
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_huffman_seed_ignored_when_literals_uncompressed() {
    use super::strategy::BtUltra2;

    let mut opt_state = HcOptState::new();
    opt_state.set_literals_compressed_for_tests(false);

    // Full dictionary seed: a Huffman table plus the default FSE tables.
    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(
        b"aaaaabbbbcccddeeff00112233445566778899",
    );
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    opt_state.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let btultra2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    opt_state.rescale_freqs(b"abcd", btultra2);

    let literal_freq_total = opt_state.lit_freq.iter().sum::<u32>();
    assert_eq!(
        opt_state.lit_sum, 0,
        "literal sum must stay zero when literals are uncompressed"
    );
    assert_eq!(
        literal_freq_total,
        0,
        "literal frequencies must ignore dictionary huffman seed when uncompressed"
    );
}
9203
#[test]
fn hc_repcode_candidates_respect_litlen_dependent_rep_order() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"xxxxxxABCDEFABCDEF".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    // Index 12 is the first byte of the second "ABCDEF" run, six bytes after
    // the start of the first one, so offset 6 is the matching repcode.
    let probe_pos: usize = 12;
    let history_end = matcher.table.history.len();
    let rep_offsets: [u32; 3] = [6, 3, 9];

    // Probe with a non-zero literal length.
    let mut offsets_with_literals = Vec::new();
    matcher.hc.for_each_repcode_candidate_with_reps(
        &matcher.table,
        probe_pos,
        1,
        rep_offsets,
        history_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_with_literals.push(candidate.offset),
    );
    assert!(
        offsets_with_literals.contains(&6),
        "when lit_len>0, rep0 should be considered and match"
    );

    // Probe again with a zero literal length.
    let mut offsets_without_literals = Vec::new();
    matcher.hc.for_each_repcode_candidate_with_reps(
        &matcher.table,
        probe_pos,
        0,
        rep_offsets,
        history_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_without_literals.push(candidate.offset),
    );
    assert!(
        !offsets_without_literals.contains(&6),
        "when lit_len==0, rep0 is not directly eligible (ll0 semantics)"
    );
}
9249
#[test]
fn hc_collect_optimal_candidates_keeps_reps_when_chain_depth_zero() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.hc.search_depth = 0;
    matcher.table.history = b"xyzxyzxyzxyz".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    // Zero chain depth in both the matcher and the profile.
    let no_chain_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: false,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 0,
    };
    let query = HcCandidateQuery {
        reps: [3, 6, 9],
        lit_len: 1,
        ldm_candidate: None,
    };

    let probe_pos: usize = 6;
    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        no_chain_profile,
        query,
        &mut candidates,
    );

    assert!(
        !candidates.is_empty(),
        "rep candidates should remain available even when chain depth is zero"
    );
    let has_rep0 = candidates.iter().any(|candidate| candidate.offset == 3);
    assert!(has_rep0, "rep0 candidate should be retained");
}
9287
#[test]
fn hc_collect_optimal_candidates_rep_tail_match_skips_chain_probe() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"aaaaaaaaaa".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    // Index every position before the probe point.
    let probe_pos: usize = 6;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);

    let deep_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 32,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        deep_profile,
        query,
        &mut candidates,
    );

    // Only the rep offsets 1 and 4 may appear in the result.
    let only_rep_offsets = candidates
        .iter()
        .all(|candidate| matches!(candidate.offset, 1 | 4));
    assert!(
        only_rep_offsets,
        "terminal rep match should return before chain probing adds non-rep offsets"
    );
}
9325
#[test]
fn hc_collect_optimal_candidates_long_chain_match_advances_skip_window() {
    let mut matcher = HcMatchGenerator::new(128);
    matcher.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    // Index every position before the probe point, then reset the skip
    // window so any advance observed below comes from the collect call.
    let probe_pos: usize = 9;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);
    matcher.table.skip_insert_until_abs = 0;

    let deep_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 32,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        deep_profile,
        query,
        &mut candidates,
    );

    let skip_until = matcher.table.skip_insert_until_abs;
    assert!(
        skip_until > probe_pos,
        "long chain match should advance skip window to avoid redundant immediate insertions"
    );
}
9363
#[test]
fn hc_collect_optimal_candidates_chain_fast_skip_uses_match_end_minus_8() {
    let mut matcher = HcMatchGenerator::new(128);
    matcher.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    // Index every position before the probe point, then reset the skip
    // window so the bounds checked below come from the collect call.
    let probe_pos: usize = 9;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);
    matcher.table.skip_insert_until_abs = 0;

    // Unlike the sibling tests, the sufficient length here is a small 10.
    let fast_skip_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: 10,
        max_chain_depth: 32,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        fast_skip_profile,
        query,
        &mut candidates,
    );

    // Furthest end position reached by any returned candidate.
    let furthest_match_end = candidates
        .iter()
        .map(|found| found.start.saturating_add(found.match_len))
        .max()
        .expect("expected at least one candidate");
    let skip_until = matcher.table.skip_insert_until_abs;
    assert!(
        skip_until > probe_pos,
        "chain fast-skip must advance past current position"
    );
    assert!(
        skip_until <= furthest_match_end.saturating_sub(8),
        "chain fast-skip must not exceed upstream zstd-style matchEndIdx - 8 bound"
    );
}
9410
#[test]
fn hc_collect_optimal_candidates_advances_skip_window_on_plain_bt_path() {
    let mut matcher = HcMatchGenerator::new(256);
    matcher.table.history = b"abcdefghijklmnop".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 0;
    matcher.table.ensure_tables();

    // No positions are inserted up front; start from a zeroed skip window.
    let probe_pos: usize = 8;
    matcher.table.skip_insert_until_abs = 0;

    let no_chain_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 0,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        no_chain_profile,
        query,
        &mut candidates,
    );

    let expected_skip_until = probe_pos.saturating_add(1);
    assert_eq!(
        matcher.table.skip_insert_until_abs,
        expected_skip_until,
        "plain BT path should advance skip window by 1 via upstream zstd matchEndIdx baseline"
    );
}
9449
9450// Removed: the three `hc_collect_optimal_candidates_*_hash3_*` /
9451// `hc_hash3_tail_match_*` tests forced `search_depth = 0` together
9452// with `hash3_log != 0`, an HC-chain-walker-only fixture state that
9453// production never reaches (hash3 is BtUltra2-only and BtUltra2 always
9454// runs `search_depth = 512`). They depended on the `has_hash3 =>
9455// BtUltra2` escape hatch in the test dispatcher; with that hatch gone
9456// (CR review on PR #123) and the dispatcher routing purely from
9457// `self.strategy_tag`, there is no production-shaped configuration
9458// that reproduces what those tests asserted. The corresponding hash3
9459// invariants are exercised end-to-end by the existing level22 roundtrip
9460// + upstream zstd-parity ratio gate.
9461
#[test]
fn hc_ldm_candidates_are_merged_into_optimal_candidates() {
    let mut matcher = HcMatchGenerator::new(512);
    // 256 bytes following the pattern `i % 251`.
    matcher.table.history = (0..256).map(|idx| (idx % 251) as u8).collect();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    let probe_pos: usize = 128;
    let history_end: usize = 256;

    // Externally supplied LDM candidate anchored at the probe position.
    let ldm = MatchCandidate {
        start: probe_pos,
        offset: 96,
        match_len: 40,
    };

    let no_chain_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 0,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: Some(ldm),
    };

    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        no_chain_profile,
        query,
        &mut candidates,
    );

    // The LDM candidate is recognised by its (offset, match_len) pair.
    let ldm_is_present = candidates
        .iter()
        .any(|found| found.offset == ldm.offset && found.match_len == ldm.match_len);
    assert!(
        ldm_is_present,
        "LDM candidate should be present in optimal candidate set"
    );
}
9502
#[test]
fn btultra_and_btultra2_both_keep_dictionary_candidates() {
    // Checks that both BtUltra2 and BtUltra still report a candidate with
    // offset >= 32 when part of the history is marked as dictionary content.
    //
    // Routes the BtUltra2 / BtUltra fixture through the production
    // `configure()` path so derived state (`hash3_log`, `is_btultra2`,
    // `uses_bt`, `backend`) stays consistent — manually flipping the
    // strategy flags here used to leave `hash3_log` / `hash3_table` in
    // the previous mode's shape and trip the
    // `Strategy::USE_HASH3 ⇒ hash3_log != 0` debug invariant inside
    // `collect_optimal_candidates_initialized_body`.
    use super::strategy::StrategyTag;

    let test_config = HcConfig {
        hash_log: 23,
        chain_log: 22,
        search_depth: 32,
        target_len: 256,
        search_mls: 4,
    };
    let window_log = 20u8;

    // Builds a 160-byte history: bytes 0..64 cycle over seven letters from
    // 'a', bytes 64..160 cycle over five letters from 'k', and 24 bytes at
    // `abs_pos` are overwritten with a copy of bytes 16..40. Positions before
    // `abs_pos` are indexed and the dictionary limit is set to 64.
    let prepare_history = |hc: &mut HcMatchGenerator, abs_pos: usize| {
        hc.table.history = alloc::vec![0u8; 160];
        for i in 0..64 {
            hc.table.history[i] = b'a' + (i % 7) as u8;
        }
        for i in 64..160 {
            hc.table.history[i] = b'k' + (i % 5) as u8;
        }
        for i in 0..24 {
            hc.table.history[abs_pos + i] = hc.table.history[16 + i];
        }
        hc.table.history_start = 0;
        hc.table.history_abs_start = 0;
        hc.table.position_base = 0;
        hc.table.ensure_tables();
        hc.table.insert_positions(0, abs_pos);
        hc.table.dictionary_limit_abs = Some(64);
        hc.table.skip_insert_until_abs = 0;
    };

    let profile = HcOptimalCostProfile {
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
        accurate: true,
        favor_small_offsets: false,
    };
    let abs_pos = 96usize;
    let mut out = Vec::new();

    // --- BtUltra2 ---
    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra2, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra2 should retain dictionary candidates on upstream zstd-parity path"
    );

    // --- BtUltra ---
    // Drop the BtUltra2 results first so the assertion below can only be
    // satisfied by candidates produced in this run; otherwise leftover
    // entries could make it pass even if BtUltra returned nothing useful.
    out.clear();

    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra should retain dictionary candidates"
    );
}
9590
#[test]
fn driver_small_source_hint_shrinks_dfast_hash_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Baseline run without any source-size hint.
    driver.reset(CompressionLevel::Level(3));
    let unhinted_payload = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..unhinted_payload.len()].copy_from_slice(unhinted_payload);
    space.truncate(unhinted_payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // Upstream zstd-parity split sizes: the long table uses DFAST_HASH_BITS,
    // the short table DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA.
    let full_long = driver.dfast_matcher().long_len();
    let full_short = driver.dfast_matcher().short_len();
    assert_eq!(full_long, 1 << DFAST_HASH_BITS);
    assert_eq!(
        full_short,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA)
    );

    // Second run on the same driver, now with a 1 KiB source-size hint.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(3));
    let hinted_payload = b"xyzxyzxyzxyz";
    let mut space = driver.get_next_space();
    space[..hinted_payload.len()].copy_from_slice(hinted_payload);
    space.truncate(hinted_payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
    let hinted_long = driver.dfast_matcher().long_len();
    let hinted_short = driver.dfast_matcher().short_len();

    // The wire `window_log` keeps its floor for decoder interop, while the
    // internal dfast tables are sized from the RAW 1 KiB source rather than
    // the floored window: `table_window = 1 << ceil_log2(1024) = 1 << 10`.
    // Both tables therefore end up at the `MIN_WINDOW_LOG` floor (the long
    // table at `dfast_hash_bits_for_window(1 << 10) = 10`; the short table
    // sits one `DFAST_SHORT_HASH_BITS_DELTA` step lower but is clamped back
    // up to `MIN_WINDOW_LOG`).
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(hinted_long, 1 << MIN_WINDOW_LOG);
    assert_eq!(hinted_short, 1 << MIN_WINDOW_LOG);
    let both_tables_shrank = hinted_long < full_long && hinted_short < full_short;
    assert!(
        both_tables_shrank,
        "tiny source hint should reduce both dfast tables"
    );
}
9636
#[test]
fn driver_huge_source_hint_does_not_overflow_table_window_shift() {
    // Regression: `reset` derives the Dfast / Row table-window shift from
    // `ceil_log2(hint)`. For a hint >= 2^63 + 1 that shift becomes 64, and
    // `1usize << 64` panics in debug / wraps to 0 in release before the
    // `.min(max_window_size)` cap gets a chance to apply. A pledged source
    // size of `u64::MAX` must size the table from the real window and must
    // neither panic nor wrap to zero.
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.reset(CompressionLevel::Level(3));

    let payload = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..payload.len()].copy_from_slice(payload);
    space.truncate(payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    let long_table_len = driver.dfast_matcher().long_len();
    assert!(
        long_table_len >= 1 << MIN_WINDOW_LOG,
        "huge hint must size the dfast table from the real window, not wrap to zero"
    );
}
9659
#[test]
fn driver_huge_source_hint_with_dict_does_not_overflow_hc_reserve() {
    // Regression: the HC/BT history-mirror pre-size sums the dictionary hint
    // and the source-size hint before `reserve_history` clamps to the window
    // ceiling. With a pledged source size of `u64::MAX` (the "unknown size"
    // sentinel) and any positive dictionary hint, `(src as usize) + dict_hint`
    // overflows `usize` — debug panic / release wrap on 64-bit, plus
    // `src as usize` truncation on 32-bit targets. Level 16 (BtOpt) goes
    // through the HashChain/BT storage arm that owns this reserve. The mirror
    // must be sized to the real window without panicking, wrapping, or
    // truncating.
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.set_dictionary_size_hint(64 * 1024);
    driver.reset(CompressionLevel::Level(16));

    // The saturated `usize::MAX` reserve target has to be clamped to the HC
    // history ceiling instead of being reserved literally (which would
    // OOM/panic). Level 16 has window_log 22, so the ceiling is
    // `window + window/4 + one block` (the `reserve_history` formula). The
    // capacity is asserted explicitly because a no-panic-only check would
    // also pass on an under-reserved mirror.
    let window = 1usize << 22;
    let quarter_window = window >> 2;
    let expected_history_ceiling =
        window + quarter_window + crate::common::MAX_BLOCK_SIZE as usize;
    let reserved_capacity = driver.hc_matcher().table.history.capacity();
    assert!(
        reserved_capacity >= expected_history_ceiling,
        "huge source + dict hint must reserve the clamped HC history ceiling, got {}",
        driver.hc_matcher().table.history.capacity()
    );

    // Feeding a small block afterwards must also go through cleanly.
    let payload = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..payload.len()].copy_from_slice(payload);
    space.truncate(payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
}
9695
#[test]
fn driver_chain_log_override_survives_row_to_hc_fallback() {
    // Regression: a RowHash level that is forced onto the HashChain backend
    // (resolved window <= 14, upstream `ZSTD_resolveRowMatchFinderMode`) must
    // build its synthesised HC chain table from an explicit `chain_log`
    // override. The RowHash override arm drops `chain_log` (Row has no chain
    // table), so the synthesis used to substitute the upstream zstd
    // `hashLog - 1` for the caller's value, silently ignoring it on
    // small-window frames.
    let chain_log_override = 10u32;
    let overrides = super::parameters::ParamOverrides {
        chain_log: Some(chain_log_override),
        ..Default::default()
    };

    let mut driver = MatchGeneratorDriver::new(32, 2);
    // A small source hint pins the window to the hinted floor (16 KiB =
    // windowLog 14), which makes the Level 6 Row finder fall back to
    // HashChain.
    driver.set_source_size_hint(1 << 12);
    driver.set_param_overrides(Some(overrides));
    driver.reset(CompressionLevel::Level(6));

    let payload = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..payload.len()].copy_from_slice(payload);
    space.truncate(payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // The override (10) sits below the window cap (14), so the resolved HC
    // chain table must carry it — NOT the upstream zstd `hashLog - 1` (18,
    // clamped to the window 14). Before the fix this resolved to 14.
    let expected_chain_log = chain_log_override as usize;
    assert_eq!(
        driver.hc_matcher().table.chain_log,
        expected_chain_log,
        "explicit chain_log override must survive the Row->HC fallback, got {}",
        driver.hc_matcher().table.chain_log
    );
}
9730
#[test]
fn driver_small_source_hint_shrinks_row_hash_tables() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Phase 1 — no source hint: Level 5 gets the full-size row table.
    driver.reset(CompressionLevel::Level(5));
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);
    let full_rows = driver.row_matcher().row_heads.len();
    // Level 5 takes the upstream row_log (clamp(searchLog=3, 4, 6) = 4) and
    // the upstream L5 hashLog (`ZSTD_getCParams(5,..).hashLog` = 19), so the
    // row count is 1 << (ROW_L5.hash_bits - ROW_L5.row_log).
    assert_eq!(full_rows, 1 << (ROW_L5.hash_bits - ROW_L5.row_log));

    // Phase 2 — a hint that leaves the resolved window above 14 bits: the Row
    // finder stays selected (upstream `ZSTD_resolveRowMatchFinderMode` turns
    // row mode on for windowLog > 14) while the row hash table shrinks to the
    // source-derived width. 64 KiB gives a raw source log of 16, so
    // `row_hash_bits_for_window(1 << 16)` is below the level's full hash_bits
    // (19) and the row count drops.
    driver.set_source_size_hint(1 << 16);
    driver.reset(CompressionLevel::Level(5));
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"xyzxyzxyzxyz");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);
    assert_eq!(
        driver.active_backend(),
        BackendTag::Row,
        "windowLog > 14 keeps the upstream row matchfinder"
    );
    let hinted_rows = driver.row_matcher().row_heads.len();
    assert!(
        hinted_rows < full_rows,
        "a window>14 source hint should reduce the row hash table footprint"
    );

    // Phase 3 — a tiny hint floors the resolved window at
    // MIN_HINTED_WINDOW_LOG = 14. Upstream runs the HASH-CHAIN matcher (not
    // Row) at windowLog <= 14, so greedy/lazy/lazy2 must be routed to the
    // HashChain backend here.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(
        driver.active_backend(),
        BackendTag::HashChain,
        "windowLog <= 14 must fall back to the upstream zstd hash-chain matchfinder",
    );
}
9782
#[test]
fn row_matches_roundtrip_multi_block_pattern() {
    /// Applies one emitted sequence to `decoded`, the way a decoder would:
    /// literals are appended verbatim, then a match copies `match_len` bytes
    /// starting `offset` bytes back. The copy is byte-by-byte so that a match
    /// overlapping its own output re-reads freshly appended bytes.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                let copy_from = decoded.len() - offset;
                for src in copy_from..copy_from + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    // Two identical 128 KiB blocks built from a repeating 10-byte pattern.
    let pattern = [7, 13, 44, 184, 19, 96, 171, 109, 141, 251];
    let first_block: Vec<u8> = pattern.iter().copied().cycle().take(128 * 1024).collect();
    let second_block = first_block.clone();

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.ensure_tables();

    // Block 1 must decode back to itself.
    let mut history = Vec::new();
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(history, first_block);

    // Block 2 is replayed on top of block 1's output; only the newly
    // appended tail is compared.
    matcher.add_data(second_block.clone(), |_| {});
    let second_start = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(&history[second_start..], second_block.as_slice());

    // Force a literals-only pass so the Sequence::Literals arm is exercised.
    let third_block: Vec<u8> = (0u8..=255).collect();
    matcher.add_data(third_block.clone(), |_| {});
    let third_start = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(&history[third_start..], third_block.as_slice());
}
9826
#[test]
fn row_short_block_emits_literals_only() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // A 5-byte block: everything must come back as literals.
    matcher.add_data(b"abcde".to_vec(), |_| {});
    let mut short_block_had_triple = false;
    let mut reconstructed = Vec::new();
    matcher.start_matching(|seq| match seq {
        Sequence::Triple { .. } => short_block_had_triple = true,
        Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
    });
    assert!(
        !short_block_had_triple,
        "row backend must not emit triples for short blocks"
    );
    assert_eq!(reconstructed, b"abcde");

    // Then feed a clearly matchable block and make sure the Triple arm is
    // reachable at all.
    let mut repeat_block_had_triple = false;
    matcher.add_data(b"abcdeabcde".to_vec(), |_| {});
    matcher.start_matching(|seq| {
        repeat_block_had_triple |= matches!(seq, Sequence::Triple { .. });
    });
    assert!(
        repeat_block_had_triple,
        "row backend should emit triples on repeated data"
    );
}
9860
#[test]
fn row_pick_lazy_returns_best_when_lookahead_is_out_of_bounds() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.add_data(b"abcabc".to_vec(), |_| {});
    // The row tables have to exist before probing: once the accept floor is
    // low enough to pass the length gate, the lookahead path goes through
    // `row_candidate` -> `row_heads[..]` (production always builds the tables
    // before any candidate probe).
    matcher.ensure_tables();

    // Minimum-length candidate handed in as the current best.
    let incumbent = MatchCandidate {
        start: 0,
        offset: 1,
        match_len: ROW_MIN_MATCH_LEN,
    };
    let chosen = matcher
        .pick_lazy_match(0, 0, Some(incumbent))
        .expect("best candidate must survive");

    // The incumbent must be returned unchanged, field by field.
    assert_eq!(chosen.start, incumbent.start);
    assert_eq!(chosen.offset, incumbent.offset);
    assert_eq!(chosen.match_len, incumbent.match_len);
}
9885
#[test]
fn row_backfills_previous_block_tail_for_cross_boundary_match() {
    /// Decoder stand-in: appends the literals, then copies `match_len` bytes
    /// from `offset` bytes back, one byte at a time so overlapping matches
    /// read their own output.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        match seq {
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                decoded.extend_from_slice(literals);
                let copy_from = decoded.len() - offset;
                for src in copy_from..copy_from + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
        }
    }

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // Block 1 ends in "XYZ"; block 2 opens with "XYZXYZ", so its very first
    // bytes can be matched at offset 3 against the previous block's tail.
    let mut first_block = alloc::vec![0xA5; 64];
    first_block.extend_from_slice(b"XYZ");
    let second_block = b"XYZXYZtail".to_vec();

    let mut reconstructed = Vec::new();
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut reconstructed, seq));
    assert_eq!(reconstructed, first_block);

    matcher.add_data(second_block.clone(), |_| {});
    let second_start = reconstructed.len();
    let mut saw_cross_boundary = false;
    matcher.start_matching(|seq| {
        // Looking for a literal-free triple reaching exactly 3 bytes back.
        saw_cross_boundary |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 3 && match_len >= ROW_MIN_MATCH_LEN
        );
        replay(&mut reconstructed, seq);
    });

    assert!(
        saw_cross_boundary,
        "row matcher should reuse the 3-byte previous-block tail"
    );
    assert_eq!(&reconstructed[second_start..], second_block.as_slice());
}
9940
#[test]
fn row_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    let data = deterministic_high_entropy_bytes(0xA713_9C5D_44E2_10B1, 4096);

    // Builds a fresh matcher, skips over `payload` with the given
    // incompressibility hint, and reports how many row slots ended up filled.
    let occupied_slots_after_skip = |incompressible: bool, payload| {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(ROW_CONFIG);
        matcher.add_data(payload, |_| {});
        matcher.skip_matching_with_hint(Some(incompressible));
        matcher
            .row_positions
            .iter()
            .filter(|slot| **slot != ROW_EMPTY_SLOT)
            .count()
    };

    let dense_slots = occupied_slots_after_skip(false, data.clone());
    let sparse_slots = occupied_slots_after_skip(true, data);

    assert!(
        sparse_slots < dense_slots,
        "incompressible hint should seed fewer row slots (sparse={sparse_slots}, dense={dense_slots})"
    );
}
9970
/// Regression for the `None` arm of `skip_matching_with_hint`: skipping a
/// block without a hint must NOT densely insert the skipped range into the
/// row table. This follows upstream zstd (`ZSTD_row_fillHashCache` only
/// pre-fills the next-scan cache, not the skipped block's interior): matches
/// into the skipped interior from later blocks are given up in exchange for
/// not paying the per-block O(block_size) insert cost.
///
/// With less than one block of input (4096 B against the default 128 KiB
/// block boundary), the only positions that may appear in the row table are
/// the ones produced by the `backfill_start` lookback at the block's start
/// (at most `ROW_HASH_KEY_LEN - 1` positions when block_start <
/// ROW_HASH_KEY_LEN). When `current_abs_start == 0` even that backfill has
/// nothing to cover, so the table stays completely empty.
#[test]
fn row_skip_matching_with_none_hint_leaves_interior_empty() {
    let data = deterministic_high_entropy_bytes(0x9B47_F2A1_8C5E_3306, 4096);

    // Case under test: skip with no hint at all.
    let none_slots = {
        let mut unhinted = RowMatchGenerator::new(1 << 22);
        unhinted.configure(ROW_CONFIG);
        unhinted.add_data(data.clone(), |_| {});
        unhinted.skip_matching_with_hint(None);
        unhinted
            .row_positions
            .iter()
            .filter(|slot| **slot != ROW_EMPTY_SLOT)
            .count()
    };

    // Control: Some(false) is the dict-priming path, which inserts every
    // position in the skipped range.
    let dense_slots = {
        let mut dense = RowMatchGenerator::new(1 << 22);
        dense.configure(ROW_CONFIG);
        dense.add_data(data, |_| {});
        dense.skip_matching_with_hint(Some(false));
        dense
            .row_positions
            .iter()
            .filter(|slot| **slot != ROW_EMPTY_SLOT)
            .count()
    };

    // The contract is pinned from both sides:
    // 1) the None hint at block-start == 0 inserts ZERO positions (there is
    //    nothing before position 0 to backfill);
    // 2) the dense control does insert, so an empty table in (1) is not
    //    simply what every path produces for this input.
    assert_eq!(
        none_slots, 0,
        "None hint at block_start=0 must leave row table fully empty \
         (upstream zstd parity — interior NOT inserted, no pre-block backfill possible)",
    );
    assert!(
        dense_slots > 0,
        "Some(false) dict-priming path must still insert densely \
         (sanity check: control case for the `none_slots == 0` assertion)",
    );
}
10025
#[test]
fn driver_unhinted_level2_keeps_default_dfast_hash_table_size() {
    // Without a source-size hint the Dfast backend must keep its default
    // table sizes.
    //
    // NOTE: the test name still says "level2", but the level exercised is
    // `Level(3)`, the one this test reads through `dfast_matcher()`. The
    // assertion messages name Level(3) so that a failure points at the level
    // that was actually configured.
    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Level(3));
    // Commit one short block so the tables are actually built.
    let mut space = driver.get_next_space();
    space[..12].copy_from_slice(b"abcabcabcabc");
    space.truncate(12);
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // Upstream zstd-parity split: long-hash at DFAST_HASH_BITS, short-hash one
    // bit smaller (DFAST_SHORT_HASH_BITS_DELTA = 1, matching upstream zstd
    // `chainLog = hashLog - 1` for dfast levels).
    let long_len = driver.dfast_matcher().long_len();
    let short_len = driver.dfast_matcher().short_len();
    assert_eq!(
        long_len,
        1 << DFAST_HASH_BITS,
        "unhinted Level(3) should keep default long-hash table size"
    );
    assert_eq!(
        short_len,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA),
        "unhinted Level(3) short-hash should be one bit smaller than long-hash"
    );
}
10053
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_backend_rejects_undersized_pooled_suffix_store() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Fastest);

    // Plant a pooled suffix store that is too small for the block below.
    driver.suffix_pool.push(SuffixStore::with_capacity(1024));

    // Commit a 4096-byte block of 0xAB.
    let mut block = driver.get_next_space();
    block.clear();
    block.resize(4096, 0xAB);
    driver.commit_space(block);

    // The newest window entry must own a store large enough for the block.
    let newest_entry = driver
        .simple()
        .window
        .last()
        .expect("window entry must exist after commit");
    let last_suffix_slots = newest_entry.suffixes.slots.len();
    assert!(
        last_suffix_slots >= 4096,
        "undersized pooled suffix store must not be reused for larger blocks"
    );
}
10080
#[test]
fn source_hint_clamps_driver_slice_size_to_window() {
    // A 1 KiB source hint resolves to the minimum hinted window; the driver's
    // slice size and the buffers it hands out must follow that window.
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    let resolved_window = driver.window_size() as usize;
    assert_eq!(resolved_window, 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(driver.slice_size, resolved_window);

    let block = driver.get_next_space();
    assert_eq!(block.len(), resolved_window);
    driver.commit_space(block);
}
10095
#[test]
fn pooled_space_keeps_capacity_when_slice_size_shrinks() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Default);

    // First frame: take a full-size buffer, note its capacity, hand it back.
    let full_size_buffer = driver.get_next_space();
    let large_capacity = full_size_buffer.capacity();
    assert!(large_capacity >= 128 * 1024);
    driver.commit_space(full_size_buffer);

    // Second frame: a tiny source hint shrinks the slice size.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    // The next buffer is shorter in length but must not have lost capacity.
    let shrunk_buffer = driver.get_next_space();
    assert_eq!(shrunk_buffer.len(), 1 << MIN_HINTED_WINDOW_LOG);
    assert!(
        shrunk_buffer.capacity() >= large_capacity,
        "pooled buffer capacity should be preserved to avoid shrink/grow churn"
    );
}
10116
#[test]
fn driver_best_to_fastest_releases_oversized_hc_tables() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Start at Best, routed onto HashChain through the test-only override
    // (production `Best` sits on level 13, whose native backend differs).
    // This allocates large HC tables (4M hash, 2M chain) so that the swap
    // below goes through the HC drain path this test pins.
    driver.reset_on_hc_lazy(CompressionLevel::Best);
    assert_eq!(driver.window_size(), (1u64 << 22));

    // Feed data so the tables are really allocated via ensure_tables().
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Switch to Fastest. The [`MatcherStorage`] enum swaps to the `Simple`
    // variant and the `HashChain` variant is dropped. The drain block in
    // `Matcher::reset` reassigns `m.table.hash_table` / `chain_table` /
    // `hash3_table` to `Vec::new()` BEFORE the replacement variant is
    // constructed, so the table backing allocations are released up front.
    // That caps peak memory during the swap at "old data buffers being
    // drained into `vec_pool` + new `MatchGenerator` skeleton" instead of
    // "old tables still resident + new variant under construction". `Drop`
    // on the old variant would free the tables eventually, but only once the
    // new variant has been built, so the early reassign moves the peak.
    // After the switch the HC variant is gone; checking that storage is now
    // `Simple` covers the invariant the old hash_table/chain_table checks
    // were standing in for.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), (1u64 << 19));
    assert_eq!(driver.active_backend(), BackendTag::Simple);
}
10153
#[test]
fn driver_better_to_best_resizes_hc_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // The lazy band now runs on the Row backend, so the HC resize path is
    // driven across two BT levels with different native `HcConfig` widths:
    // L13 (hash_log 22, chain_log 22) -> L15 (hash_log 23, chain_log 23).
    driver.reset(CompressionLevel::Level(13));
    assert_eq!(driver.window_size(), (1u64 << 22));

    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Table sizes as allocated for L13.
    let (l13_hash_len, l13_chain_len) = {
        let hc = driver.hc_matcher();
        (hc.table.hash_table.len(), hc.table.chain_table.len())
    };

    // Move to L15 — the tables have to grow.
    driver.reset(CompressionLevel::Level(15));
    assert_eq!(driver.window_size(), (1u64 << 22));

    // Feed data again so ensure_tables runs with the new sizes.
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"xyzxyzxyzxyz");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    let (l15_hash_len, l15_chain_len) = {
        let hc = driver.hc_matcher();
        (hc.table.hash_table.len(), hc.table.chain_table.len())
    };
    assert!(
        l15_hash_len > l13_hash_len,
        "L15 hash_table ({}) should be larger than L13 ({})",
        l15_hash_len,
        l13_hash_len
    );
    assert!(
        l15_chain_len > l13_chain_len,
        "L15 chain_table ({}) should be larger than L13 ({})",
        l15_chain_len,
        l13_chain_len
    );
}
10199
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_dictionary_preserves_history_for_first_full_block() {
    // The dictionary and the first block carry the same 8 bytes, so the whole
    // block should come out as one literal-free match 8 bytes back.
    let dictionary = b"abcdefgh";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(dictionary);
    driver.commit_space(block);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "first full block should still match dictionary-primed history"
    );
}
10234
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_large_dictionary_preserves_early_history_until_first_block() {
    // 24-byte dictionary; the block repeats only its first 8 bytes, so the
    // expected match reaches back the full dictionary length.
    let dictionary = b"abcdefghABCDEFGHijklmnop";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(&dictionary[..8]);
    driver.commit_space(block);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 24 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "dictionary bytes should remain addressable until frame output exceeds the live window"
    );
}
10269
#[test]
fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
    // Priming with zero dictionary bytes must still install the supplied
    // repeat-offset history on the matcher.
    let empty_dictionary: &[u8] = &[];

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    driver.prime_with_dictionary(empty_dictionary, [11, 7, 3]);

    let installed_offsets = driver.simple_mut().offset_hist;
    assert_eq!(installed_offsets, [11, 7, 3]);
}
10279
#[test]
fn hc_prime_with_empty_dictionary_disables_btultra2_seed_pass() {
    use super::strategy::BtUltra2;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    // Prime with offsets only; the dictionary content is empty.
    driver.prime_with_dictionary(&[], [11, 7, 3]);

    // The offset history is installed on the HC table ...
    assert_eq!(driver.hc_matcher().table.offset_hist, [11, 7, 3]);

    // ... and the seed pass stays off for a source just above the threshold.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming, even when dict content is empty"
    );
}
10295
#[test]
fn primed_snapshot_not_restored_across_ldm_config_change() {
    // The CDict-equivalent primed snapshot clones `storage`, and on the BT
    // backend that includes `BtMatcher::ldm_producer`. A snapshot taken under
    // one LDM configuration must NOT be restored into a reset that resolved a
    // different one, otherwise the restored producer is stale. `PrimedKey`
    // therefore has to fold the LDM override into the key, so the restore is
    // refused and the caller re-primes.
    use super::parameters::CompressionParameters;

    let level = CompressionLevel::Level(19);
    let dict = b"abcdefghabcdefghabcdefgh";

    // Two override sets for the same level, differing only in LDM.
    let ldm_off = CompressionParameters::builder(level)
        .build()
        .unwrap()
        .overrides();
    let ldm_on = CompressionParameters::builder(level)
        .enable_long_distance_matching(true)
        .build()
        .unwrap()
        .overrides();

    let mut driver = MatchGeneratorDriver::new(1024, 1);

    // Step 1: prime and capture a snapshot with LDM on.
    driver.set_param_overrides(Some(ldm_on));
    driver.reset(level);
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Step 2: same dictionary and level, LDM now OFF. The snapshot's LDM
    // state is stale, so the restore has to be refused.
    driver.set_param_overrides(Some(ldm_off));
    driver.reset(level);
    let restored_across_change = driver.restore_primed_dictionary(level);
    assert!(
        !restored_across_change,
        "primed snapshot restored across an LDM config change (stale producer)",
    );

    // Step 3 (sanity): re-prime and capture under LDM-off, then restore under
    // the IDENTICAL LDM-off config. This must match, so the key is not
    // over-tight.
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);
    driver.reset(level);
    let restored_same_config = driver.restore_primed_dictionary(level);
    assert!(
        restored_same_config,
        "primed snapshot not restored under identical LDM config",
    );
}
10344
#[test]
fn hc_prime_with_dictionary_disables_btultra2_seed_pass() {
    use super::strategy::BtUltra2;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    // Prime with real dictionary content this time.
    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // The seed pass must stay off for a source just above the threshold.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming with content"
    );
}
10359
#[test]
fn dfast_prime_with_dictionary_preserves_history_for_first_full_block() {
    // 16-byte dictionary followed by an identical 16-byte block: the whole
    // block matches the dictionary, at offset = dictionary length = 16.
    let payload = b"abcdefghijklmnop";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    // Level(4) is Dfast with the greedy double-fast loop (upstream zstd
    // parity: clevels.h L3/L4 are both `ZSTD_dfast`, which has no lazy
    // lookahead). The fast loop needs at least `HASH_READ_SIZE` (8) bytes
    // ahead of the probe cursor, hence the 16-byte sizes.
    driver.reset(CompressionLevel::Level(4));
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        // A literal-free triple reaching back exactly one dictionary length.
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty()
                && offset == payload.len()
                && match_len >= DFAST_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "dfast backend should match dictionary-primed history in first full block"
    );
}
10398
#[test]
fn prime_with_dictionary_does_not_inflate_reported_window_size() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // Sample the reported window on both sides of the priming call.
    let window_before_priming = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let window_after_priming = driver.window_size();

    assert_eq!(
        window_after_priming, window_before_priming,
        "dictionary retention budget must not change reported frame window size"
    );
}
10413
#[test]
fn primed_snapshot_not_restored_when_window_hint_differs() {
    // The copy-snapshot has to be keyed on the resolved reset parameters, not
    // on the CompressionLevel alone. `reset()` caps window_log by the
    // source-size hint, so two frames at the same level but with different
    // hints resolve to different windows. If a snapshot captured at the
    // larger hint were restored into a reset for the smaller one, the frame
    // header would advertise the smaller window while the matcher's
    // `max_window_size` (from the restored storage) still covered the larger
    // one. The encoder could then emit a match (e.g. into the dictionary)
    // beyond the advertised window and produce an undecodable frame. Restore
    // must REFUSE whenever the resolved window differs.
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";
    let level = CompressionLevel::Best;
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Frame A: large hint -> larger resolved window. Prime and capture.
    driver.set_source_size_hint(256 * 1024);
    driver.reset(level);
    let big_window = driver.window_size();
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Frame B: smaller hint at the SAME level -> smaller resolved window.
    driver.set_source_size_hint(48 * 1024);
    driver.reset(level);
    let small_window = driver.window_size();
    assert!(
        small_window < big_window,
        "precondition: the two hints must resolve to different windows \
         (small={small_window}, big={big_window})"
    );

    let restored = driver.restore_primed_dictionary(level);
    assert!(
        !restored,
        "snapshot captured at window {big_window} must NOT be restored into a \
         reset advertising window {small_window} (level alone is an insufficient key)"
    );
}
10452
#[test]
fn primed_snapshot_restored_for_hints_in_same_window_bucket() {
    // The snapshot key has to normalise the source-size hint to the resolved
    // matcher geometry rather than use the raw hinted byte count. `reset()`
    // derives every hint-dependent parameter (window_log cap,
    // HC/Fast/Dfast/Row table widths, the Fast attach-vs-copy cutoff) from
    // `ceil_log2(hint)`, so two different hints in the same ceil-log bucket
    // resolve to the *identical* matcher shape. A key built from the raw
    // bytes over-keys: the second frame would be forced through a full
    // re-prime although the cached snapshot fits perfectly. Restore must
    // SUCCEED across same-bucket hints.
    const DICT: &[u8] = b"abcdefghABCDEFGHijklmnop";
    let level = CompressionLevel::Best;
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // 300 KiB and 400 KiB both sit in ceil_log2 bucket 19
    // (2^18 < n <= 2^19), so they resolve to the same window and table
    // widths. Capture under the first hint.
    driver.set_source_size_hint(300 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(DICT, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Reset under the second hint and confirm the precondition.
    driver.set_source_size_hint(400 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: same-bucket hints must resolve to the same window \
         (a={window_a}, b={window_b})"
    );

    let restored = driver.restore_primed_dictionary(level);
    assert!(
        restored,
        "snapshot captured at a 300 KiB hint must be restored into a 400 KiB \
         hint that resolves to the identical matcher shape (raw bytes over-key)"
    );
}
10490
#[test]
fn primed_snapshot_restored_across_level22_tier_hints() {
    // Level 22 folds several ceil-log buckets into a single upstream zstd
    // source-size tier: `resolve_level_params(Level(22), ..)` picks the HC
    // config and window_log from raw `<= 16 KiB / 128 KiB / 256 KiB`
    // thresholds. A 20 KiB hint (ceil-log bucket 15) and a 100 KiB hint
    // (bucket 17) therefore both fall in the `<= 128 KiB` tier and resolve to
    // the IDENTICAL matcher (same window_log, same HC hash/chain/search
    // geometry). A key built from the ceil-log bucket alone would reject this
    // restore because the buckets differ; comparing the resolved matcher
    // shape lets both hints share one snapshot.
    let level = CompressionLevel::Level(22);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Capture at the 20 KiB hint.
    driver.set_source_size_hint(20 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Re-resolve at the 100 KiB hint; the window must be unchanged.
    driver.set_source_size_hint(100 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: both hints must land in the same Level 22 upstream zstd tier \
         (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "Level 22 snapshot captured at a 20 KiB hint must be restored into a \
         100 KiB hint that resolves to the same upstream zstd tier (different ceil-log \
         buckets, identical matcher shape)"
    );
}
10527
#[test]
fn fast_dict_attaches_within_cutoff_bounds() {
    // Inside the attach bounds every Fast dict frame must attach. (The
    // copy-mode owned path memmoved the whole input into history on each
    // frame, whereas attach scans the input in place through the borrowed
    // dual-base kernel.) Each hint below is far under
    // `FAST_ATTACH_DICT_CUTOFF_LOG` (2 GiB source) and the dict is far under
    // `MAX_FAST_ATTACH_DICT_REGION` (16 MiB). So the hint that used to cross
    // the old 8 KiB cutoff (8193 B) and the one just below it (8192 B) BOTH
    // resolve to attach, and the Simple backend reports a borrowed (in-place)
    // dict scan for each of them. This keeps `FAST_ATTACH_DICT_CUTOFF_LOG`
    // high enough that no in-bounds Fast hint drops to the input-copy path.
    // The OUT-of-bounds fallbacks are exercised by
    // `fast_attach_cutoff_keeps_virtual_positions_within_u32` (source) and
    // `oversized_dict_hint_routes_fast_to_copy_mode` (dict size).
    let in_bounds_hints: [u64; 3] = [8192, 8193, 1 << 20];
    let level = CompressionLevel::Level(1);

    for hint in in_bounds_hints {
        // A fresh driver per hint so no state leaks between iterations.
        let mut driver = MatchGeneratorDriver::new(8, 1);
        driver.set_source_size_hint(hint);
        driver.reset(level);
        driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

        let attached = driver.borrowed_dict_supported();
        assert!(
            attached,
            "Fast dict frame with hint {hint} must attach (borrowed in-place \
             dict scan), never fall back to the copy-mode input-copy path"
        );
    }
}
10554
#[test]
fn fast_attach_cutoff_keeps_virtual_positions_within_u32() {
    // Why the cutoff is 31 and not the whole u64 source-size range: the
    // borrowed dict kernel keeps virtual positions as u32 (`cur_abs as u32`).
    // The largest attached source `1 << CUTOFF` (plus the dict prefix) has to
    // stay below u32::MAX or that arithmetic wraps, and the next bucket
    // (4 GiB) would wrap. Pinning both sides of the bound means a future
    // "just raise it to attach everything" change cannot silently bring the
    // overflow back — the kernel's position type must be widened first.
    let u32_ceiling = u64::from(u32::MAX);

    let max_attached: u64 = 1u64 << FAST_ATTACH_DICT_CUTOFF_LOG;
    assert!(
        max_attached <= u32_ceiling,
        "the largest attached source 2^{FAST_ATTACH_DICT_CUTOFF_LOG} must fit u32 \
         virtual positions",
    );

    let next_bucket = 1u64 << (FAST_ATTACH_DICT_CUTOFF_LOG + 1);
    assert!(
        next_bucket > u32_ceiling,
        "the next bucket 2^{} would overflow u32 virtual positions",
        FAST_ATTACH_DICT_CUTOFF_LOG + 1,
    );
}
10576
#[test]
fn oversized_dict_hint_routes_fast_to_copy_mode() {
    // When the dict region is larger than the tagged attach position field
    // (`MAX_FAST_ATTACH_DICT_REGION`, 16 MiB) the Fast prime must take COPY
    // mode; the tagged attach fill would overflow the packed position.
    // Because the decision is keyed on the load-set size hint, a hint one
    // byte past the limit exercises it without allocating a real 16 MiB
    // dict. In copy mode the borrowed in-place dict scan (attach-only) is
    // unavailable, which is what the assertion observes.
    let oversized_hint = MAX_FAST_ATTACH_DICT_REGION + 1;

    let mut fast_driver = MatchGeneratorDriver::new(8, 1);
    fast_driver.set_dictionary_size_hint(oversized_hint);
    fast_driver.reset(CompressionLevel::Level(1));
    fast_driver.prime_with_dictionary(b"small dict content with some padding here", [1, 4, 8]);

    let attached = fast_driver.borrowed_dict_supported();
    assert!(
        !attached,
        "an oversized dict must use copy mode, not the tagged attach fill"
    );
}
10594
#[test]
fn block_samples_match_dict_is_true_for_non_simple_backend() {
    // Production fallback behaviour. A non-Simple backend (Row at Level 6
    // here) has no dict probe, so the driver wrapper CONSERVATIVELY answers
    // `true` for ANY block. That keeps the dict frame on the scan instead of
    // letting the raw-fast-path emit the block raw and miss an embedded dict
    // segment (see `dictionary_segment_in_incompressible_input_is_matched`).
    // Only the Simple/Fast backend swaps the blanket scan for a precise probe.
    let dict = b"the quick brown fox jumps over the lazy dog 0123456789abcdef";

    let mut driver = MatchGeneratorDriver::new(8, 6);
    driver.set_dictionary_size_hint(dict.len());
    driver.reset(CompressionLevel::Level(6));
    driver.prime_with_dictionary(dict, [1, 4, 8]);

    // A block copied straight from the dictionary.
    let dict_prefix = &dict[..32];
    assert!(
        driver.block_samples_match_dict(dict_prefix),
        "non-Simple backend must stay on the scan (true) for a dict frame"
    );

    // A deterministic pseudo-random block unrelated to the dictionary.
    let noise: alloc::vec::Vec<u8> = (0..64u8)
        .map(|i| i.wrapping_mul(37).wrapping_add(13))
        .collect();
    assert!(
        driver.block_samples_match_dict(&noise),
        "non-Simple backend reports true regardless of block content"
    );
}
10620
#[test]
fn primed_snapshot_fast_attach_does_not_over_key_non_simple_backends() {
    // `fast_attach` models only the Simple/Fast backend's attach-vs-copy
    // table split. Dfast/Row/HashChain each have their OWN attach/copy regime
    // (`DFAST_ATTACH_DICT_CUTOFF_LOG`, `ROW_ATTACH_DICT_CUTOFF_LOG`,
    // `HC_ATTACH_DICT_CUTOFF_LOG`), and those are deliberately kept OUT of the
    // `fast_attach` key. Their snapshots are keyed by the resolved matcher
    // geometry instead, and the HC modes share one window geometry so an HC
    // cross-mode restore stays decodable (see `prime_with_dictionary`).
    //
    // In every case the `fast_attach` bit must NOT leak into a non-Simple
    // snapshot key. If it did, an unhinted capture (which would record
    // `fast_attach = true`) and a hinted reset resolving to the IDENTICAL
    // `LevelParams` would key differently and force a needless re-prime.
    //
    // The level is spelled out as `Level(12)` to reach the Row arm this test
    // pins: the `Best` alias now sits on level 13 (Btlazy2) and no longer
    // gets there. The test also pins the Row arm recording its RESOLVED hash
    // width on the unhinted path (a 0 default there keyed unhinted-vs-hinted
    // apart).
    let level = CompressionLevel::Level(12);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Capture side: no source-size hint at all.
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Restore side: a hint big enough (>= 2^window_log) that the source-size
    // cap is a no-op, so the window/params equal the unhinted level's.
    driver.set_source_size_hint(64 * 1024 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: the large hint must resolve to the same window as the \
         unhinted level (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "a Row snapshot must restore across an unhinted vs large-hinted \
         reset that resolves to the identical matcher — `fast_attach` is a Fast \
         backend concept and must not over-key non-Simple shapes"
    );
}
10667
// `#[cfg(any())]` is never satisfied, so this test is compiled out entirely;
// it is kept only as a record of the legacy behaviour it used to cover.
#[cfg(any())] // disabled: tested SuffixStore-per-block tail-handling specific to legacy MatchGenerator
#[test]
fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
    // Legacy contract: priming a 9-byte dictionary must not leave any window
    // entry shorter than `MIN_MATCH_LEN` behind.
    let mut driver = MatchGeneratorDriver::new(8, 2);
    driver.reset(CompressionLevel::Fastest);

    // This dictionary leaves a 1-byte tail chunk (capacity=1 suffix table),
    // which should never be committed to the matcher window.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    // Every entry that did reach the window must be long enough to match on.
    assert!(
        driver
            .simple()
            .window
            .iter()
            .all(|entry| entry.data.len() >= MIN_MATCH_LEN),
        "dictionary priming must not commit tails shorter than MIN_MATCH_LEN"
    );
}
10687
#[test]
fn prime_with_dictionary_counts_only_committed_tail_budget() {
    // The Simple backend (`Fastest`) must grow its retention budget only by
    // the dictionary bytes that were actually committed to history.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    let baseline_budget = driver.simple_mut().max_window_size;

    // Nine bytes: one full slice plus a 1-byte tail that cannot be committed.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let primed_budget = driver.simple_mut().max_window_size;
    assert_eq!(
        primed_budget,
        baseline_budget + 8,
        "retention budget must account only for dictionary bytes actually committed to history"
    );
}
10703
#[test]
fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() {
    // Dfast (Level 3) keeps a 4-byte dictionary tail in its retention budget.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));
    let baseline_budget = driver.dfast_matcher().max_window_size;

    // Twelve bytes: one full slice plus a 4-byte tail. Dfast can still reach
    // that tail through short-hash overlap into the next block, so the tail
    // stays retained and the budget grows by the full dictionary length.
    driver.prime_with_dictionary(b"abcdefghijkl", [1, 4, 8]);

    let primed_budget = driver.dfast_matcher().max_window_size;
    assert_eq!(
        primed_budget,
        baseline_budget + 12,
        "dfast retention budget should include 4-byte dictionary tails"
    );
}
10720
#[test]
fn row_prime_with_dictionary_preserves_history_for_first_full_block() {
    // Level(5) selects the greedy Row backend (LEVEL_TABLE row 5: Greedy /
    // RowHash). Level(4) routes to Dfast nowadays, so Level(5) is required to
    // really exercise `RowMatchGenerator`'s dictionary priming. With a
    // 16-byte dict followed by an identical 16-byte block, the entire block
    // can match the primed dict at offset = dict length = 16.
    let payload = b"abcdefghijklmnop";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    // Commit one block whose bytes equal the dictionary.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    // Look for a literal-free triple that points back exactly one dictionary
    // length and is at least as long as the Row accept floor.
    let mut saw_match = false;
    driver.start_matching(|seq| match seq {
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } if literals.is_empty()
            && offset == payload.len()
            && match_len >= ROW_MIN_MATCH_LEN =>
        {
            saw_match = true;
        }
        _ => {}
    });

    assert!(
        saw_match,
        "row backend should match dictionary-primed history in first full block"
    );
}
10759
#[test]
fn row_prime_with_dictionary_subtracts_uncommitted_tail_budget() {
    // Row backend (Level 5): a dictionary tail too short to commit must not
    // count toward the retained window.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    let unprimed_window = driver.row_matcher().max_window_size;

    // With a slice size of 8, the ninth byte is a tail shorter than 4 bytes
    // and cannot be committed, so it is subtracted from the retained budget.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let primed_window = driver.row_matcher().max_window_size;
    assert_eq!(
        primed_window,
        unprimed_window + 8,
        "row retained window must exclude uncommitted 1-byte tail"
    );
}
10776
#[test]
fn prime_with_dictionary_budget_shrinks_after_row_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));

    // Shrink the live window to a single slice so the dictionary-primed
    // slices get evicted after very little input.
    driver.row_matcher_mut().max_window_size = 8;
    driver.reported_window_size = 8;
    let unprimed_window = driver.row_matcher().max_window_size;

    // Priming 24 dictionary bytes widens the window by the same amount.
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(driver.row_matcher().max_window_size, unprimed_window + 24);

    // Feed two blocks without matching to push the dictionary slices out.
    for block in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buf = driver.get_next_space();
        buf.clear();
        buf.extend_from_slice(block);
        driver.commit_space(buf);
        driver.skip_matching_with_hint(None);
    }

    // The dictionary budget is gone and the window is back to its base size.
    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let window_after_eviction = driver.row_matcher().max_window_size;
    assert_eq!(
        window_after_eviction,
        unprimed_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
10807
/// A Row → Simple transition drops the Row variant, leaving exactly
/// Simple as the active backend after the switch. The pre-enum-era
/// check that the window was emptied
/// (`driver.row_matcher().window.is_empty()`) is deliberately absent:
/// once the swap happens there is no `Row` variant left to inspect
/// through an accessor, so "window cleared" is replaced by "variant
/// dropped" and a later `row_matcher()` call would panic by design.
/// Pool recycling on the row backend is covered separately by
/// [`driver_row_commit_recycles_block_buffer_into_pool`].
#[test]
fn row_get_last_space_then_reset_to_fastest_drops_row_variant() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), BackendTag::Row);

    // Commit one block and read it back through the Row backend.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"row-data");
    driver.commit_space(block);
    assert_eq!(driver.get_last_space(), b"row-data");

    // Switching to `Fastest` must leave Simple as the active backend.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.active_backend(), BackendTag::Simple);
}
10833
/// A Row commit must hand the input buffer back to `vec_pool` right
/// away: the bytes are mirrored into the contiguous `history`, so the
/// window has no reason to hold a second copy. This protects the
/// chunk-length window design — the earlier `VecDeque<Vec<u8>>` window
/// kept a full `block_capacity` buffer for every committed block, and
/// on a heavily pre-split frame that inflated peak memory to many times
/// the live byte count. Recycling at commit time means the pool grows
/// by exactly one Vec per committed block.
#[test]
fn driver_row_commit_recycles_block_buffer_into_pool() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), BackendTag::Row);

    let before_pool = driver.vec_pool.len();

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"row-data-to-recycle");
    driver.commit_space(block);

    // Strict `>` on purpose. A fresh driver has `before_pool == 0`, so `>=`
    // would pass even when the commit recycled nothing. Strict growth shows
    // the buffer went back to the pool at commit time instead of staying in
    // the window (the pre-`chunk_lens` bug).
    assert!(
        driver.vec_pool.len() > before_pool,
        "row commit must recycle the committed block buffer into vec_pool \
         (before_pool = {before_pool}, after = {})",
        driver.vec_pool.len()
    );

    // The committed bytes remain readable via the contiguous history mirror.
    assert_eq!(driver.get_last_space(), b"row-data-to-recycle");
}
10867
#[test]
fn adjust_params_for_zero_source_size_uses_min_hinted_window_floor() {
    // Start from the unhinted Level 4 parameters with a deliberately large
    // window, then apply a zero source-size hint.
    let mut unhinted = resolve_level_params(CompressionLevel::Level(4), None);
    unhinted.window_log = 22;

    // The adjusted window must land on the hinted-window floor.
    let hinted = adjust_params_for_source_size(unhinted, 0);
    assert_eq!(hinted.window_log, MIN_HINTED_WINDOW_LOG);
}
10875
#[test]
fn common_prefix_len_matches_scalar_reference_across_offsets() {
    /// Byte-at-a-time oracle: the index of the first differing byte, or the
    /// shorter length when one slice is a prefix of the other.
    fn scalar_reference(a: &[u8], b: &[u8]) -> usize {
        a.iter()
            .zip(b)
            .position(|(lhs, rhs)| lhs != rhs)
            .unwrap_or_else(|| a.len().min(b.len()))
    }

    // Lengths and mismatch offsets to sweep.
    let total_lens = [
        0usize, 1, 5, 15, 16, 17, 31, 32, 33, 64, 65, 127, 191, 257, 320,
    ];
    let mismatch_offsets = [0usize, 1, 7, 15, 16, 31, 32, 47, 63, 95, 127, 128, 129, 191];

    for total_len in total_lens {
        let pattern: Vec<u8> = (0..total_len)
            .map(|i| ((i * 13 + 7) & 0xFF) as u8)
            .collect();

        // Vary the slice start; starts beyond the buffer are skipped.
        for start in [0usize, 1, 3].into_iter().filter(|&s| s <= total_len) {
            let lhs = &pattern[start..];
            let same = lhs.to_vec();

            // Identical slices: the whole length is the common prefix.
            assert_eq!(
                common_prefix_len(lhs, &same),
                scalar_reference(lhs, &same),
                "equal slices total_len={total_len} start={start}"
            );

            // Flip one byte at each in-range offset of the sweep.
            let len = lhs.len();
            for mismatch in mismatch_offsets.into_iter().filter(|&m| m < len) {
                let mut flipped = same.clone();
                flipped[mismatch] ^= 0x5A;
                assert_eq!(
                    common_prefix_len(lhs, &flipped),
                    scalar_reference(lhs, &flipped),
                    "total_len={total_len} start={start} mismatch={mismatch}"
                );
            }

            // Flip only the last byte (nothing to flip in an empty slice).
            if let Some(mismatch) = len.checked_sub(1) {
                let mut flipped = same.clone();
                flipped[mismatch] ^= 0xA5;
                assert_eq!(
                    common_prefix_len(lhs, &flipped),
                    scalar_reference(lhs, &flipped),
                    "tail mismatch total_len={total_len} start={start} mismatch={mismatch}"
                );
            }
        }
    }

    // Inputs of unequal length: the shorter one bounds the result.
    let long = alloc::vec![0xAB; 320];
    let shorter = alloc::vec![0xAB; 137];
    assert_eq!(
        common_prefix_len(&long, &shorter),
        scalar_reference(&long, &shorter)
    );
}
10938
#[test]
fn row_pick_lazy_returns_none_when_next_is_better() {
    // 64 identical bytes of history.
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    row.add_data(alloc::vec![b'a'; 64], |_| {});
    row.ensure_tables();

    // Offer a current-best candidate of only the minimum accepted length.
    let abs_pos = row.history_abs_start + 16;
    let minimal_best = MatchCandidate {
        start: abs_pos,
        offset: 8,
        match_len: ROW_MIN_MATCH_LEN,
    };

    let picked = row.pick_lazy_match(abs_pos, 0, Some(minimal_best));
    assert!(
        picked.is_none(),
        "lazy picker should defer when next position is clearly better"
    );
}
10957
#[test]
fn row_pick_lazy_depth2_returns_none_when_next2_significantly_better() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    // Depth-2 lazy evaluation with the row search depth set to zero and a
    // repcode history that contains offset 9.
    matcher.lazy_depth = 2;
    matcher.search_depth = 0;
    matcher.offset_hist = [6, 9, 1];

    // Fixture: filler bytes with a hand-built pattern at 11..30.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAB");
    matcher.add_data(fixture, |_| {});
    matcher.ensure_tables();

    let abs_pos = matcher.history_abs_start + 20;

    // Baseline at the current position: the offset-9 repcode match.
    let best = matcher
        .best_match(abs_pos, 0)
        .expect("expected baseline repcode match");
    assert_eq!(best.offset, 9);
    // The fixture data fixes the baseline length (the offset-9 rep run is
    // 6 bytes long); it does not depend on the accept threshold.
    assert_eq!(best.match_len, 6);

    // If a +1 candidate exists it must not beat the baseline.
    if let Some(next) = matcher.best_match(abs_pos + 1, 1) {
        assert!(next.match_len <= best.match_len);
    }

    // The +2 candidate must beat the baseline by more than one byte.
    let next2 = matcher
        .best_match(abs_pos + 2, 2)
        .expect("expected +2 candidate");
    assert!(
        next2.match_len > best.match_len + 1,
        "+2 candidate must be significantly better for depth-2 lazy skip"
    );

    let picked = matcher.pick_lazy_match(abs_pos, 0, Some(best));
    assert!(
        picked.is_none(),
        "lazy picker should defer when +2 candidate is significantly better"
    );
}
10996
#[test]
fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    // Depth-2 lazy evaluation with the row search depth set to zero and a
    // repcode history that contains offset 9.
    row.lazy_depth = 2;
    row.search_depth = 0;
    row.offset_hist = [6, 9, 1];

    // Fixture: filler bytes with a hand-built pattern at 11..30. The
    // assertion further down pins the +2 candidate to exactly one byte
    // longer than the baseline.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAZ");
    row.add_data(fixture, |_| {});
    row.ensure_tables();

    let abs_pos = row.history_abs_start + 20;

    // Baseline at the current position: the offset-9 repcode match.
    let baseline = row
        .best_match(abs_pos, 0)
        .expect("expected baseline repcode match");
    assert_eq!(baseline.offset, 9);
    // The fixture data fixes the baseline length (the offset-9 rep run is
    // 6 bytes long); it does not depend on the accept threshold.
    assert_eq!(baseline.match_len, 6);

    // A one-byte gain at +2 is not enough to abandon the baseline.
    let plus_two = row
        .best_match(abs_pos + 2, 2)
        .expect("expected +2 candidate");
    assert_eq!(plus_two.match_len, baseline.match_len + 1);

    // The picker must hand back the baseline unchanged.
    let kept = row
        .pick_lazy_match(abs_pos, 0, Some(baseline))
        .expect("lazy picker should keep current best");
    assert_eq!(kept.start, baseline.start);
    assert_eq!(kept.offset, baseline.offset);
    assert_eq!(kept.match_len, baseline.match_len);
}
11030
/// Checks that `hash_and_row` derives the row index and tag from the
/// shared hash mix using the expected bit split.
#[test]
fn row_hash_and_row_extracts_high_bits() {
    let history = alloc::vec![
        0xAA, 0xBB, 0xCC, 0x11, 0x10, 0x20, 0x30, 0x40, 0xAA, 0xBB, 0xCC, 0x22, 0x50, 0x60,
        0x70, 0x80,
    ];

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.add_data(history, |_| {});
    matcher.ensure_tables();

    let pos = matcher.history_abs_start + 8;
    let (actual_row, actual_tag) = matcher
        .hash_and_row(pos)
        .expect("row hash should be available");

    // Recompute the key the way `row_key_value` does: an mls-wide masked key
    // when 8 lookahead bytes exist, the 4-byte key in the tail. Offset 8 in a
    // 16-byte history leaves exactly 8 bytes, so the wide arm applies here.
    let offset = pos - matcher.history_abs_start;
    let key_bytes: [u8; 8] = matcher.live_history()[offset..offset + 8]
        .try_into()
        .unwrap();
    let key_mask = (1u64 << (matcher.mls.min(6) * 8)) - 1;
    let key = u64::from_le_bytes(key_bytes) & key_mask;

    // Keep the top `row_hash_log + ROW_TAG_BITS` bits of the mix; the low
    // `ROW_TAG_BITS` of that are the tag and the bits above them the row.
    let mixed = crate::encoding::fastpath::hash_mix_u64_with_kernel(matcher.hash_kernel, key);
    let kept_bits = matcher.row_hash_log + ROW_TAG_BITS;
    let top = mixed >> (u64::BITS as usize - kept_bits);
    let row_mask = (1usize << matcher.row_hash_log) - 1;
    let expected_row = ((top >> ROW_TAG_BITS) as usize) & row_mask;
    let expected_tag = top as u8;

    assert_eq!(actual_row, expected_row);
    assert_eq!(actual_tag, expected_tag);
}
11068
#[test]
fn row_repcode_skips_candidate_before_history_start() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // Live history begins at absolute position 10. Repcode offset 3 from
    // position 12 would point at absolute position 9, before that start.
    matcher.history_abs_start = 10;
    matcher.history_start = 0;
    matcher.history = alloc::vec![b'a'; 20];
    matcher.offset_hist = [3, 0, 0];

    assert!(matcher.repcode_candidate(12, 1).is_none());
}
11080
#[test]
fn row_repcode_returns_none_when_position_too_close_to_history_end() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // Five bytes of history starting at absolute position 0; querying
    // position 4 leaves only the final byte to compare.
    matcher.history_abs_start = 0;
    matcher.history_start = 0;
    matcher.history = b"abcde".to_vec();
    matcher.offset_hist = [1, 0, 0];

    assert!(matcher.repcode_candidate(4, 1).is_none());
}
11092
#[cfg(all(feature = "std", target_arch = "x86_64"))]
#[test]
fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath::{self, FastpathKernel};

    // Nothing to compare on hosts without SSE4.2.
    if !is_x86_feature_detected!("sse4.2") {
        return;
    }

    let input = 0x0123_4567_89AB_CDEFu64;
    // SAFETY: the runtime feature check above guarantees SSE4.2 is available.
    let accelerated = unsafe { fastpath::sse42::hash_mix_u64(input) };

    // The dispatcher has to resolve to SSE4.2 (or better) and yield the same
    // mix as the direct SSE4.2 call.
    let dispatched = fastpath::dispatch_hash_mix_u64(input);
    let kernel = fastpath::select_kernel();
    if kernel != FastpathKernel::Sse42 {
        // The AVX2 kernel uses the same CRC32 instruction under the hood.
        assert_eq!(dispatched, accelerated, "AVX2/SSE4.2 share CRC32 mix");
    } else {
        assert_eq!(dispatched, accelerated);
    }
}
11113
#[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))]
#[test]
fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath;

    // Nothing to compare on hosts without the CRC extension.
    if !is_aarch64_feature_detected!("crc") {
        return;
    }

    let input = 0x0123_4567_89AB_CDEFu64;
    // SAFETY: the runtime feature check above guarantees CRC32 is available.
    let accelerated = unsafe { fastpath::neon::hash_mix_u64(input) };

    // The dispatcher must produce the same mix as the direct call.
    let dispatched = fastpath::dispatch_hash_mix_u64(input);
    assert_eq!(dispatched, accelerated);
}
11127
#[test]
fn hc_hash3_position_matches_hash3_formula() {
    use super::match_table::storage::MatchTable;

    let bytes = *b"abcd";

    // Reference formula: shift the little-endian word left by one byte,
    // multiply by the 3-byte prime, and keep the top `HC3_HASH_LOG` bits.
    let word = u32::from_le_bytes(bytes);
    let mixed = (word << 8).wrapping_mul(HC_PRIME3BYTES);
    let expected = (mixed >> (32 - HC3_HASH_LOG)) as usize;

    assert_eq!(MatchTable::hash3_position(&bytes, HC3_HASH_LOG), expected);
}
11138
#[test]
fn hc_hash_position_matches_hash4_formula() {
    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    let bytes = *b"abcd";

    // Reference formula: multiply the little-endian word by the 4-byte
    // prime and keep the top `hash_log` bits.
    let word = u32::from_le_bytes(bytes);
    let mixed = word.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - hc.table.hash_log)) as usize;

    assert_eq!(hc.table.hash_position(&bytes), expected);
}
11148
#[test]
fn btultra2_main_hash_uses_hash4_formula() {
    use super::match_table::storage::MatchTable;

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(
        BTULTRA2_HC_CONFIG_L22,
        super::strategy::StrategyTag::BtUltra2,
        27,
    );

    let bytes = *b"abcdefgh";

    // Reference: the 4-byte formula applied to the first four bytes only.
    let first_word = u32::from_le_bytes(bytes[..4].try_into().unwrap());
    let mixed = first_word.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - hc.table.hash_log)) as usize;

    // Hash the full 8-byte input with the BT matcher's `HASH_MLS`.
    let actual = MatchTable::hash_position_with_mls(
        &bytes,
        hc.table.hash_log,
        super::bt::BtMatcher::HASH_MLS,
    );
    assert_eq!(actual, expected);
}
11167
#[test]
fn row_candidate_returns_none_when_abs_pos_near_end_of_history() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // The history is one byte shorter than the accept floor, so from
    // abs_pos 0 fewer than `ROW_MIN_MATCH_LEN` bytes remain. The length gate
    // in `row_candidate` has to return `None` before it reaches the row
    // tables, which are never built in this test.
    matcher.history_abs_start = 0;
    matcher.history_start = 0;
    matcher.history = alloc::vec![b'a'; ROW_MIN_MATCH_LEN - 1];

    assert!(matcher.row_candidate(0, 0).is_none());
}
11182
#[test]
fn hc_chain_candidates_returns_sentinels_for_short_suffix() {
    let mut hc = HcMatchGenerator::new(32);

    // Only three bytes of history, starting at absolute position 0.
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;
    hc.table.history = b"abc".to_vec();
    hc.table.ensure_tables();

    // Every returned slot must be the `usize::MAX` sentinel.
    let candidates = hc.hc.chain_candidates(&hc.table, 0);
    assert!(candidates.iter().all(|&pos| pos == usize::MAX));
}
11194
/// After `reset`, entries inserted for the previous frame must no longer be
/// matchable: the floor (`history_abs_start`) moves to the old end of
/// history and every decodable slot resolves below it.
#[test]
fn hc_reset_advances_floor_past_prior_frame_entries() {
    use super::match_table::storage::MatchTable;

    let mut generator = HcMatchGenerator::new(32);
    generator.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    generator.table.ensure_tables();
    // Seed genuine hash / chain entries for positions of the first frame.
    generator.table.insert_positions(0, 6);

    let old_end = generator.table.history_abs_end();
    assert_eq!(old_end, 10);
    assert!(generator.table.hash_table.iter().any(|&v| v != HC_EMPTY));

    generator.reset(|_| {});

    // Contract under test: `reset` does not zero the tables; it advances the
    // floor past every earlier position. Each populated slot therefore
    // decodes to an absolute position strictly below `history_abs_start`,
    // which the `window_low` guard rejects before any byte is read.
    let floor = generator.table.history_abs_start;
    assert_eq!(floor, old_end);

    let base = generator.table.position_base;
    let shift = generator.table.index_shift;
    let decoded = generator
        .table
        .hash_table
        .iter()
        .filter_map(|&slot| MatchTable::stored_abs_position_fast(slot, base, shift));
    for candidate_abs in decoded {
        assert!(
            candidate_abs < floor,
            "a prior-frame entry must resolve below the advanced floor"
        );
    }
}
11226
/// When the next floor would exceed `REBASE_RESET_FLOOR_CEILING`, `reset`
/// must fall back to rewinding to the origin and clearing both tables.
#[test]
fn hc_reset_full_zeroes_when_floor_would_cross_ceiling() {
    use super::match_table::storage::REBASE_RESET_FLOOR_CEILING;

    let mut generator = HcMatchGenerator::new(32);
    generator.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    generator.table.ensure_tables();
    // Recognisable non-empty junk so a real wipe is observable afterwards.
    generator.table.chain_table.fill(456);
    generator.table.hash_table.fill(123);

    // Starting at the ceiling pushes the would-be floor (`history_abs_end`)
    // beyond it, forcing `reset` onto the bounded fallback: return to the
    // origin and zero the tables. That keeps the absolute cursor from
    // creeping toward `usize::MAX` on 32-bit targets.
    generator.table.history_abs_start = REBASE_RESET_FLOOR_CEILING;

    generator.reset(|_| {});

    assert_eq!(generator.table.history_abs_start, 0);
    assert_eq!(generator.table.position_base, 0);
    assert!(generator.table.hash_table.iter().all(|slot| *slot == HC_EMPTY));
    assert!(generator.table.chain_table.iter().all(|slot| *slot == HC_EMPTY));
}
11248
/// Committing an empty block and then matching must not invoke the
/// sequence callback at all.
#[test]
fn hc_start_matching_returns_early_for_empty_current_block() {
    let mut generator = HcMatchGenerator::new(32);
    generator.table.add_data(Vec::new(), |_| {});

    let mut emitted = 0usize;
    generator.start_matching(|_| emitted += 1);

    assert!(emitted == 0, "empty current block should not emit sequences");
}
11257
/// Deterministic pseudo-random byte source for test fixtures.
///
/// Steps a 64-bit xor/shift recurrence (shifts 13, 7, 17) starting from
/// `seed` and emits bits 40..=47 of the state after every step, producing
/// exactly `len` bytes. Identical `(seed, len)` inputs always yield identical
/// output.
///
/// An all-zero state is a fixed point of the recurrence (every xor/shift of
/// zero is zero), which would make the output a run of zero bytes. A zero
/// `seed` is therefore replaced with a fixed non-zero constant; every
/// non-zero seed produces the same bytes as before.
#[cfg(test)]
fn deterministic_high_entropy_bytes(seed: u64, len: usize) -> Vec<u8> {
    /// Substitute state used when the caller passes the degenerate zero seed.
    const ZERO_SEED_FALLBACK: u64 = 0x9E37_79B9_7F4A_7C15;

    let mut state = if seed == 0 { ZERO_SEED_FALLBACK } else { seed };
    let mut out = Vec::with_capacity(len);
    out.extend((0..len).map(|_| {
        state ^= state << 13;
        state ^= state >> 7;
        state ^= state << 17;
        // Truncating cast is intentional: keep one byte from the middle of
        // the state.
        (state >> 40) as u8
    }));
    out
}
11270
/// Splits `data` into the `(start, length)` block ranges chosen by
/// `optimal_block_size` at compression level 22.
///
/// Each candidate block is capped at `HC_BLOCKSIZE_MAX`, and every returned
/// length is at least 1 and at most the candidate length, so the ranges
/// tile `data` front to back. Empty input yields an empty vector.
#[cfg(feature = "bench_internals")]
pub(crate) fn level22_block_ranges(data: &[u8]) -> Vec<(usize, usize)> {
    use super::cost_model::HC_BLOCKSIZE_MAX;

    let total = data.len();
    let mut spans = Vec::new();
    let mut start = 0usize;
    let mut savings = 0i64;

    while start < total {
        let left = total - start;
        let window_len = left.min(HC_BLOCKSIZE_MAX);
        let suggested = crate::encoding::frame_compressor::optimal_block_size(
            CompressionLevel::Level(22),
            &data[start..start + window_len],
            left,
            HC_BLOCKSIZE_MAX,
            savings,
        );
        // Clamp to the candidate window and always make forward progress.
        let len = suggested.min(window_len).max(1);
        spans.push((start, len));
        start += len;

        // Upstream zstd gates the pre-block splitter on real compressed-size
        // savings. In this corpus parity harness, once a full block's worth
        // of input has been consumed, a positive `savings` value is enough to
        // authorize that same splitter path.
        if start >= HC_BLOCKSIZE_MAX {
            savings = 3;
        }
    }

    spans
}
11299
11300#[cfg(feature = "bench_internals")]
/// Folds block-delimiter pseudo-sequences into the next real sequence.
///
/// A delimiter is a triple whose offset and match length are both zero; its
/// literal count is carried forward (saturating) and added to the literal
/// count of the next non-delimiter triple. Literals still pending at the end
/// are emitted as one trailing `(literals, 0, 0)` entry; nothing is emitted
/// when that pending count is zero.
fn merge_block_delimiters(sequences: Vec<(usize, usize, usize)>) -> Vec<(usize, usize, usize)> {
    let mut merged = Vec::with_capacity(sequences.len());
    let mut carried = 0usize;

    for triple in sequences {
        match triple {
            // Delimiter: remember its literals for whatever comes next.
            (lits, 0, 0) => carried = carried.saturating_add(lits),
            // Real sequence: absorb any carried literals, then start over.
            (lits, offset, match_len) => {
                merged.push((lits.saturating_add(carried), offset, match_len));
                carried = 0;
            }
        }
    }

    if carried != 0 {
        merged.push((carried, 0, 0));
    }
    merged
}
11317
/// White-box dump of the level-22 sequence stream for `data`.
///
/// Returns `(literal-length, offset, match-length)` triples as emitted by
/// the match generator, after block-delimiter pseudo-sequences have been
/// merged into the literal run of the triple that follows them. Any entry
/// left with both offset and match length equal to zero is dropped. This is
/// pure Rust; the C-conformance comparison that consumes the result lives
/// in the `ffi-bench` crate.
#[cfg(feature = "bench_internals")]
pub(crate) fn collect_level22_sequences(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let raw = collect_level22_sequences_with_delimiters(data);
    let mut merged = merge_block_delimiters(raw);
    // Discard literal-only leftovers: keep a triple unless it is all-zero in
    // both the offset and the match-length field.
    merged.retain(|&(_, offset, match_len)| !(offset == 0 && match_len == 0));
    merged
}
11330
/// Runs a level-22 `MatchGeneratorDriver` over `data`, block by block (as
/// partitioned by `level22_block_ranges`), and records every emitted
/// sequence as a `(literal-length, offset, match-length)` triple.
///
/// Literal-only sequences are recorded with offset and match length zero.
#[cfg(feature = "bench_internals")]
fn collect_level22_sequences_with_delimiters(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let mut driver = MatchGeneratorDriver::new(super::cost_model::HC_BLOCKSIZE_MAX, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Level(22));

    let mut captured = Vec::new();
    for (start, len) in level22_block_ranges(data) {
        let block = &data[start..start + len];
        let n = block.len();

        // Fill the driver-provided buffer with exactly this block's bytes.
        let mut buffer = driver.get_next_space();
        buffer[..n].copy_from_slice(block);
        buffer.truncate(n);
        driver.commit_space(buffer);

        driver.start_matching(|seq| {
            captured.push(match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            });
        });
    }
    captured
}
11358
/// After a first block is skipped via `skip_matching(Some(true))`, a second
/// block that begins with the first block's final bytes must still produce
/// an immediate match that points back at that tail.
#[test]
fn hc_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";

    // Pseudo-random filler whose last bytes are overwritten with `tail`.
    let mut first = deterministic_high_entropy_bytes(0xD1B5_4A32_9C77_0E19, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    // `first` is never read again, so hand it over by value instead of
    // cloning the buffer.
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // The second block repeats the tail right at its start.
    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.table.add_data(second, |_| {});

    // Record only the first sequence the matcher emits.
    let mut first_sequence: Option<(usize, usize, usize)> = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (literals_len, offset, match_len) =
        first_sequence.expect("expected at least one sequence after sparse skip");
    assert_eq!(
        literals_len, 0,
        "first sequence should start at block boundary"
    );
    assert_eq!(
        offset,
        tail.len(),
        "first match should reference previous tail"
    );
    assert!(
        match_len >= tail.len(),
        "tail-aligned cross-block match must be preserved"
    );
}
11404
/// BtUltra2 variant of the sparse-skip tail test: after
/// `skip_matching(Some(true))` on the first block, a second block that opens
/// with the first block's trailing bytes must match them right away.
#[test]
fn btultra2_sparse_skip_matching_preserves_tail_cross_block_match() {
    let tail = b"Bt9kLm2Rp";

    let mut bt = HcMatchGenerator::new(1 << 20);
    bt.configure(
        BTULTRA2_HC_CONFIG_L22,
        super::strategy::StrategyTag::BtUltra2,
        20,
    );

    // First block: pseudo-random filler ending in `tail`.
    let mut block_one = deterministic_high_entropy_bytes(0xA9C3_7F21_D4E8_510B, 4096);
    let split = block_one.len() - tail.len();
    block_one[split..].copy_from_slice(tail);
    bt.table.add_data(block_one, |_| {});
    bt.skip_matching(Some(true));

    // Second block: the same tail followed by fresh literals.
    let mut block_two = tail.to_vec();
    block_two.extend_from_slice(b"after-tail-literals");
    bt.table.add_data(block_two, |_| {});

    // Capture just the first emitted sequence as a plain triple.
    let mut first_sequence: Option<(usize, usize, usize)> = None;
    bt.start_matching(|seq| {
        if first_sequence.is_none() {
            let triple = match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            };
            first_sequence = Some(triple);
        }
    });

    let (literals_len, offset, match_len) =
        first_sequence.expect("expected at least one sequence after sparse BT skip");
    assert_eq!(
        literals_len, 0,
        "BT sparse skip should preserve an immediate boundary match"
    );
    assert_eq!(
        offset,
        tail.len(),
        "first BT match should reference previous tail"
    );
    assert!(
        match_len >= tail.len(),
        "BT sparse skip must seed the dense tail for cross-block matching"
    );
}
11455
/// After `skip_matching(Some(true))`, a position that lies both on the
/// `INCOMPRESSIBLE_SKIP_STEP` grid and inside the trailing dense region must
/// not have a chain entry that points back at itself, which is what a second
/// insertion of the same position would leave behind.
#[test]
fn hc_sparse_skip_matching_does_not_reinsert_sparse_tail_positions() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let first = deterministic_high_entropy_bytes(0xC2B2_AE3D_27D4_EB4F, 4096);
    // Capture the length first so the buffer can be moved into the table
    // rather than cloned.
    let current_len = first.len();
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Absolute bounds of the block that was just skipped.
    let current_abs_start =
        matcher.table.history_abs_start + matcher.table.window_size - current_len;
    let current_abs_end = current_abs_start + current_len;

    // Start of the trailing dense region, clamped to the live history and to
    // the block itself.
    let dense_tail = HC_MIN_MATCH_LEN + INCOMPRESSIBLE_SKIP_STEP;
    let tail_start = current_abs_end
        .saturating_sub(dense_tail)
        .max(matcher.table.history_abs_start)
        .max(current_abs_start);

    // First position in the dense tail that also sits on the sparse grid.
    let overlap_pos = (tail_start..current_abs_end)
        .find(|&pos| (pos - current_abs_start).is_multiple_of(INCOMPRESSIBLE_SKIP_STEP))
        .expect("fixture should contain at least one sparse-grid overlap in dense tail");

    let rel = matcher
        .table
        .relative_position(overlap_pos)
        .expect("overlap position should be representable as relative position");
    let chain_idx = (rel as usize) & ((1usize << matcher.table.chain_log) - 1);
    assert_ne!(
        matcher.table.chain_table[chain_idx],
        rel + 1,
        "sparse-grid tail positions must not be reinserted (self-loop chain entry)"
    );
}
11488
/// `compact_history` must drop the consumed prefix: with 26 bytes of history
/// and `history_start` at 16, only the bytes from index 16 onward remain and
/// the start offset returns to zero.
#[test]
fn hc_compact_history_drains_when_threshold_crossed() {
    let mut generator = HcMatchGenerator::new(8);
    let alphabet = b"abcdefghijklmnopqrstuvwxyz";
    generator.table.history = alphabet.to_vec();
    generator.table.history_start = 16;

    generator.table.compact_history();

    assert_eq!(generator.table.history_start, 0);
    assert_eq!(generator.table.history, b"qrstuvwxyz");
}
11498
/// With `position_base` set above the position being inserted,
/// `insert_position_no_rebase` must leave both tables untouched.
#[test]
fn hc_insert_position_no_rebase_returns_when_relative_pos_unavailable() {
    let mut generator = HcMatchGenerator::new(32);
    generator.table.history = b"abcdefghijklmnop".to_vec();
    generator.table.history_abs_start = 0;
    // Position 0 lies below this base.
    generator.table.position_base = 1;
    generator.table.ensure_tables();

    let (hash_snapshot, chain_snapshot) = (
        generator.table.hash_table.clone(),
        generator.table.chain_table.clone(),
    );

    generator.table.insert_position_no_rebase(0);

    assert_eq!(generator.table.hash_table, hash_snapshot);
    assert_eq!(generator.table.chain_table, chain_snapshot);
}
11514
/// A contiguous `insert_positions(0, 9)` must move the `next_to_update3`
/// cursor up to the end of the inserted range.
#[test]
fn hc_insert_positions_advances_next_to_update3_for_contiguous_range() {
    let mut generator = HcMatchGenerator::new(64);
    {
        // Fresh table state anchored at absolute position zero.
        let table = &mut generator.table;
        table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
        table.history_start = 0;
        table.history_abs_start = 0;
        table.position_base = 0;
        table.ensure_tables();
        table.next_to_update3 = 0;
    }

    generator.table.insert_positions(0, 9);

    assert_eq!(
        generator.table.next_to_update3, 9,
        "contiguous insert_positions should advance hash3 update cursor"
    );
}
11532
/// A strided `insert_positions_with_step(0, 16, 4)` skips positions, so the
/// `next_to_update3` cursor must stay where it was.
#[test]
fn hc_insert_positions_with_step_keeps_next_to_update3_cursor_for_sparse_ranges() {
    let mut generator = HcMatchGenerator::new(64);
    {
        // Fresh table state anchored at absolute position zero.
        let table = &mut generator.table;
        table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
        table.history_start = 0;
        table.history_abs_start = 0;
        table.position_base = 0;
        table.ensure_tables();
        table.next_to_update3 = 0;
    }

    generator.table.insert_positions_with_step(0, 16, 4);

    assert_eq!(
        generator.table.next_to_update3, 0,
        "sparse insert_positions_with_step must not mark skipped positions as hash3-updated"
    );
}
11550
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
/// Fastest-level variant: once the dictionary-primed bytes have been evicted
/// from a tiny window, the retained dictionary budget must be zero and the
/// window must be back at its pre-prime size.
#[test]
fn prime_with_dictionary_budget_shrinks_after_simple_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    // Shrink the live window so the primed dictionary slices fall out fast
    // and budget retirement is deterministic.
    driver.simple_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    let base_window = driver.simple_mut().max_window_size;
    let dictionary = b"abcdefghABCDEFGHijklmnop";
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);
    assert_eq!(driver.simple_mut().max_window_size, base_window + 24);

    // Two full-window blocks push the dictionary bytes out of the window.
    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buffer = driver.get_next_space();
        buffer.clear();
        buffer.extend_from_slice(payload);
        driver.commit_space(buffer);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let window_now = driver.simple_mut().max_window_size;
    assert_eq!(
        window_now,
        base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11584
/// Level-3 (dfast matcher) variant: once the dictionary-primed bytes have
/// been evicted from a tiny window, the retained dictionary budget must be
/// zero and the window must be back at its pre-prime size.
#[test]
fn prime_with_dictionary_budget_shrinks_after_dfast_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));
    // This regression shrinks the live window so the primed dictionary
    // slices fall out fast and budget retirement is deterministic.
    driver.dfast_matcher_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    let base_window = driver.dfast_matcher().max_window_size;
    let dictionary = b"abcdefghABCDEFGHijklmnop";
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);
    assert_eq!(driver.dfast_matcher().max_window_size, base_window + 24);

    // Two full-window blocks push the dictionary bytes out of the window.
    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buffer = driver.get_next_space();
        buffer.clear();
        buffer.extend_from_slice(payload);
        driver.commit_space(buffer);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let window_now = driver.dfast_matcher().max_window_size;
    assert_eq!(
        window_now,
        base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11616
/// After priming the hash-chain backend with an 8-byte dictionary, a first
/// block with the same 8 bytes must yield a zero-literal match at offset 8.
#[test]
fn hc_prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    // Force the HashChain route: in production `Better` resolves to the Row
    // backend, whereas this test pins the HC dictionary-priming behaviour.
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // The block repeats the dictionary bytes so HC has something to find.
    // With HC_MIN_MATCH_LEN at 5, an 8-byte match clears the threshold.
    let mut buffer = driver.get_next_space();
    buffer.clear();
    buffer.extend_from_slice(b"abcdefgh");
    driver.commit_space(buffer);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        let hit = matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= HC_MIN_MATCH_LEN
        );
        if hit {
            saw_match = true;
        }
    });

    assert!(
        saw_match,
        "hash-chain backend should match dictionary-primed history in first full block"
    );
}
11653
/// HashChain variant: once the dictionary-primed bytes have been evicted
/// from a tiny window, the retained dictionary budget must be zero and the
/// window must be back at its pre-prime size.
#[test]
fn prime_with_dictionary_budget_shrinks_after_hc_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // Shrink the live window so the primed dictionary slices fall out fast.
    driver.hc_matcher_mut().table.max_window_size = 8;
    driver.reported_window_size = 8;

    let base_window = driver.hc_matcher().table.max_window_size;
    let dictionary = b"abcdefghABCDEFGHijklmnop";
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);
    assert_eq!(driver.hc_matcher().table.max_window_size, base_window + 24);

    // Two full-window blocks push the dictionary bytes out of the window.
    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buffer = driver.get_next_space();
        buffer.clear();
        buffer.extend_from_slice(payload);
        driver.commit_space(buffer);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let window_now = driver.hc_matcher().table.max_window_size;
    assert_eq!(
        window_now,
        base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11684
/// `reapply_resident_dictionary` must put back the retained-dictionary
/// budget that the per-frame `reset` zeroed.
#[test]
fn resident_reapply_restores_retained_dictionary_budget() {
    // Scenario: a frame reuses a dictionary that is already resident, so the
    // re-prime is skipped. The per-frame `reset` clears the retained-dict
    // budget, yet the matcher's own `reset` inflates `max_window_size` by the
    // dictionary region again. If the budget is not restored, the driver
    // keeps it at 0 and `retire_dictionary_budget` never shrinks the
    // inflated window while the dictionary evicts. On the HashChain backend,
    // where `window_low` is measured against `max_window_size`, a match made
    // after eviction could then exceed the frame header's base window and
    // emit an over-window offset.
    let dict = b"abcdefghABCDEFGHijklmnopqrstuvwxyz0123456789";
    let mut driver = MatchGeneratorDriver::new(1 << 16, 1);

    // First frame: prime normally and note the advertised window.
    driver.set_dictionary_size_hint(dict.len());
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    let base = driver.reported_window_size;
    assert!(
        driver.dictionary_retained_budget > 0,
        "the priming frame must retain a non-zero dict budget"
    );

    // Second frame: `reset` sees the resident dictionary and re-borrows it.
    driver.set_dictionary_size_hint(dict.len());
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    assert!(
        driver.dictionary_is_resident(),
        "the second frame must re-borrow the resident dictionary"
    );
    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "reset clears the retained-dict budget"
    );
    let inflated = driver.hc_matcher().table.max_window_size;
    assert!(
        inflated > base,
        "reset re-inflates the window by the resident dict region \
         (inflated={inflated}, base={base})"
    );

    driver.reapply_resident_dictionary([1, 4, 8]);
    let expected_budget = inflated - base;
    assert_eq!(
        driver.dictionary_retained_budget,
        expected_budget,
        "resident reapply must restore the retained-dict budget (= window \
         inflation) so the retire path can shrink the window as the dict evicts"
    );
}
11732
/// A HashChain commit that evicts nothing must leave the retained
/// dictionary budget exactly as it was after priming.
#[test]
fn hc_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression guard. Since the window<->history dedup,
    // MatchTable::add_data calls its reuse_space callback with the *input*
    // buffer (for recycling) rather than with evicted chunks. The HC arm of
    // commit_space therefore has to compute evicted bytes from the
    // window_size delta. Treating the callback argument as evicted data
    // would bill the whole committed block as "evicted" and retire
    // dictionary budget too early, even with a window that is far from full.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // Window large enough that one small block cannot evict anything.
    driver.hc_matcher_mut().table.max_window_size = 1 << 20;
    driver.reported_window_size = 1 << 20;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

    let budget_after_prime = driver.dictionary_retained_budget;
    assert!(
        budget_after_prime > 0,
        "priming must retain a non-zero dictionary budget"
    );

    let mut buffer = driver.get_next_space();
    buffer.clear();
    buffer.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(buffer);
    driver.skip_matching_with_hint(None);

    let budget_now = driver.dictionary_retained_budget;
    assert_eq!(
        budget_now, budget_after_prime,
        "a commit that evicts nothing must retire no dictionary budget"
    );
}
11765
/// A Row-backend commit that evicts nothing must leave the retained
/// dictionary budget exactly as it was after priming.
#[test]
fn row_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression guard for the Row arm of commit_space following the
    // window -> chunk_lens migration. RowMatchGenerator::add_data now calls
    // its reuse_space callback with the *input* buffer (recycled on every
    // commit) rather than with evicted chunks. Like the Dfast and HashChain
    // arms, the Row arm has to compute evicted bytes from the window_size
    // delta. Treating the callback argument as evicted data would bill the
    // whole committed block as "evicted" and retire dictionary budget too
    // early, even with a window that is far from full.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert!(matches!(driver.storage, MatcherStorage::Row(_)));
    // Window large enough that one small block cannot evict anything.
    driver.row_matcher_mut().max_window_size = 1 << 20;
    driver.reported_window_size = 1 << 20;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

    let budget_after_prime = driver.dictionary_retained_budget;
    assert!(
        budget_after_prime > 0,
        "priming must retain a non-zero dictionary budget"
    );

    let mut buffer = driver.get_next_space();
    buffer.clear();
    buffer.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(buffer);
    driver.skip_matching_with_hint(None);

    let budget_now = driver.dictionary_retained_budget;
    assert_eq!(
        budget_now, budget_after_prime,
        "a Row commit that evicts nothing must retire no dictionary budget"
    );
}
11800
/// Once absolute history positions exceed the `u32` range, `skip_matching`
/// must rebase the table so inserts and chain lookups keep working. The
/// test returns early on targets whose `usize` cannot hold such a position.
#[test]
fn hc_rebases_positions_after_u32_boundary() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;

    let Ok(history_abs_start) = usize::try_from(u64::from(u32::MAX) + 64) else {
        return;
    };
    // Pretend the stream has run long enough for absolute history positions
    // to leave the u32 range. Prior to #51 this turned HC inserts off
    // altogether.
    matcher.table.history_abs_start = history_abs_start;
    matcher.skip_matching(None);

    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should anchor to the oldest live absolute position"
    );

    let populated = matcher
        .table
        .hash_table
        .iter()
        .any(|entry| *entry != HC_EMPTY);
    assert!(
        populated,
        "HC hash table should still be populated after crossing u32 boundary"
    );

    // A populated table is not enough: candidate lookup must survive too.
    let probe = matcher.table.history_abs_start + 10;
    let candidates = matcher.hc.chain_candidates(&matcher.table, probe);
    let found_real = candidates.iter().any(|candidate| *candidate != usize::MAX);
    assert!(
        found_real,
        "chain_candidates should return valid matches after rebase"
    );
}
11837
// Restricted to 64-bit targets. The absolute cursor fabricated here (beyond
// 4 GiB) cannot exist where usize == u32, and parking `history_abs_start`
// next to `u32::MAX` on such a target overflows `usize` inside the
// `check_stream_abs_headroom` guard before the rebase path is ever reached.
// This plays the same role as the `try_into()` early return in
// `hc_rebases_positions_after_u32_boundary`.
#[cfg(target_pointer_width = "64")]
#[test]
fn row_rebases_positions_after_u32_boundary() {
    // The Row backend keeps absolute match positions in u32. Over a long
    // stream the running absolute cursor leaves the u32 range although the
    // live window stays bounded, so `add_data` has to move the coordinate
    // origin down to the oldest live byte rather than assert. Before the
    // rebase existed, the `< u32::MAX` assertion panicked here and valid
    // long Row-backed frames were lost.
    let mut row = RowMatchGenerator::new(64);
    row.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});

    // Fake roughly 4 GiB of stream behind a bounded window by placing the
    // live bytes just below the u32 absolute ceiling.
    let near_ceiling = (u32::MAX as usize) - 16;
    row.history_abs_start = near_ceiling;

    // This commit would carry a u32 position over the ceiling, so add_data
    // must rebase the origin instead of panicking.
    row.add_data(b"fghij".to_vec(), |_| {});

    assert!(
        row.history_abs_start < near_ceiling,
        "add_data must rebase the absolute origin down when the cursor nears \
         u32::MAX (got {})",
        row.history_abs_start
    );
    let live_end = row.history_abs_start + row.window_size;
    assert!(
        live_end < u32::MAX as usize,
        "after rebase the live window must fit below the u32 position ceiling"
    );
}
11875
/// `maybe_rebase_positions(abs_pos)` must rebuild table entries only for
/// positions below `abs_pos`. The result is compared against a reference
/// table built by inserting exactly those positions one by one.
#[test]
fn hc_rebase_rebuilds_only_inserted_prefix() {
    let payload = b"abcdeabcdeabcde";

    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(payload.to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;

    // Skip on targets whose `usize` cannot represent a post-u32 position.
    let Ok(history_abs_start) = usize::try_from(u64::from(u32::MAX) + 64) else {
        return;
    };
    matcher.table.history_abs_start = history_abs_start;
    let abs_pos = matcher.table.history_abs_start + 6;

    // Reference: already rebased, tables emptied, then only the positions
    // in `history_abs_start..abs_pos` inserted.
    let mut expected = HcMatchGenerator::new(64);
    expected.table.add_data(payload.to_vec(), |_| {});
    expected.table.ensure_tables();
    expected.table.history_abs_start = history_abs_start;
    expected.table.position_base = expected.table.history_abs_start;
    expected.table.hash_table.fill(HC_EMPTY);
    expected.table.chain_table.fill(HC_EMPTY);
    let prefix = expected.table.history_abs_start..abs_pos;
    prefix.for_each(|pos| expected.table.insert_position_no_rebase(pos));

    matcher.table.maybe_rebase_positions(abs_pos);

    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should still anchor to the oldest live absolute position"
    );
    assert_eq!(
        matcher.table.hash_table, expected.table.hash_table,
        "rebase must rebuild only positions already inserted before abs_pos"
    );
    assert_eq!(
        matcher.table.chain_table, expected.table.chain_table,
        "future positions must not be pre-seeded into HC chains during rebase"
    );
}
11915
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// A one-slot `SuffixStore` must accept an insert and return it on lookup.
#[test]
fn suffix_store_with_single_slot_does_not_panic_on_keying() {
    let key = b"abcde";
    let mut store = SuffixStore::with_capacity(1);
    store.insert(key, 0);

    assert!(store.contains_key(key));
    assert_eq!(store.get(key), Some(0));
}
11924
#[cfg(any())]
// disabled: hash_fill_step is a legacy MatchGenerator field; FastKernelMatcher walks stride=1 today
/// Checks the hash-fill stride chosen for each level and that `Better`
/// switches the driver onto the HashChain backend.
#[test]
fn fastest_reset_uses_interleaved_hash_fill_step() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Uncompressed fills every position.
    driver.reset(CompressionLevel::Uncompressed);
    let uncompressed_step = driver.simple().hash_fill_step;
    assert_eq!(uncompressed_step, 1);

    // Fastest uses the interleaved stride.
    driver.reset(CompressionLevel::Fastest);
    let fastest_step = driver.simple().hash_fill_step;
    assert_eq!(fastest_step, FAST_HASH_FILL_STEP);

    // `Better` runs on the HashChain backend with lazy2: confirm both the
    // backend switch and the configured lazy depth.
    driver.reset(CompressionLevel::Better);
    assert_eq!(
        driver.active_backend(),
        super::strategy::BackendTag::HashChain
    );
    assert_eq!(driver.window_size(), 1u64 << 23);
    assert_eq!(driver.hc_matcher().hc.lazy_depth, 2);
}
11947
/// Feeding `"abcde"` three times in a row must yield a single triple
/// (5 literal bytes, offset 5, match length 10) and leave the matcher's
/// offset history at `[5, 1, 4]`.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_updates_offset_history_after_emitting_match() {
    let mut generator = MatchGenerator::new(64);
    let input = b"abcdeabcdeabcde".to_vec();
    generator.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    let expected = Sequence::Triple {
        literals: b"abcde",
        offset: 5,
        match_len: 10,
    };
    let emitted = generator.next_sequence(|seq| assert_eq!(seq, expected));

    assert!(emitted);
    assert_eq!(generator.offset_hist, [5, 1, 4]);
}
11970
/// With the cursor parked on the second `"abcdefghij"` and an offset history
/// of `[99, 10, 4]`, `repcode_candidate` must report `Some((10, 10))`, i.e.
/// the history entry holding 10 is the one that matches.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep1_before_hash_lookup() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // Both cursors sit at index 10, the start of the repeated half.
    matcher.last_idx_in_sequence = 10;
    matcher.suffix_idx = 10;
    matcher.offset_hist = [99, 10, 4];

    let newest = matcher.window.last().unwrap();
    let suffix = &newest.data[10..];
    assert_eq!(matcher.repcode_candidate(suffix, 0), Some((10, 10)));
}
11988
/// Commits the same 10-byte block twice (skipping matching on the first) and
/// expects `repcode_candidate` on the second block to return `Some((10, 10))`
/// — an offset of 10 from the start of the newest block points into the
/// block committed before it.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_repcode_can_target_previous_window_entry() {
    let block = b"abcdefghij";
    let mut matcher = MatchGenerator::new(64);

    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(64), |_, _| {});

    matcher.offset_hist = [99, 10, 4];

    let current = &matcher.window.last().unwrap().data;
    assert_eq!(matcher.repcode_candidate(current, 0), Some((10, 10)));
}
12010
/// Same fixture as the rep1 variant, but the matching distance (10) sits in
/// the last history slot: `offset_hist = [99, 4, 10]`. The candidate must
/// still be `Some((10, 10))`.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep2() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    matcher.last_idx_in_sequence = 10;
    matcher.suffix_idx = 10;
    // rep1=4 does not match at idx 10, rep2=10 does.
    matcher.offset_hist = [99, 4, 10];

    let newest = matcher.window.last().unwrap();
    let suffix = &newest.data[10..];
    assert_eq!(matcher.repcode_candidate(suffix, 0), Some((10, 10)));
}
12028
/// Neither of the last two history slots matches here; the only distance
/// that does is `offset_hist[0] - 1 == 10`. `repcode_candidate` must find it
/// and return `Some((10, 10))`.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep0_minus1() {
    let mut matcher = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    matcher.last_idx_in_sequence = 10;
    matcher.suffix_idx = 10;
    // rep1=4 and rep2=99 do not match; rep0-1 == 10 does.
    matcher.offset_hist = [11, 4, 99];

    let newest = matcher.window.last().unwrap();
    let suffix = &newest.data[10..];
    assert_eq!(matcher.repcode_candidate(suffix, 0), Some((10, 10)));
}
12046
/// `offset_match_len` must return `None` for an offset that reaches past the
/// retained data: from index 3 of the second block only 3 + 10 = 13 bytes
/// precede the cursor, so an offset of 14 has nothing to point at.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_repcode_rejects_offsets_beyond_searchable_prefix() {
    let older = b"abcdefghij".to_vec();
    let newer = b"klmnopqrst".to_vec();
    let mut matcher = MatchGenerator::new(64);

    matcher.add_data(older, SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    matcher.add_data(newer, SuffixStore::with_capacity(64), |_, _| {});
    matcher.suffix_idx = 3;

    let newest = matcher.window.last().unwrap();
    let suffix = &newest.data[3..];
    assert_eq!(matcher.offset_match_len(14, suffix), None);
}
12067
/// Even with `hash_fill_step` set to `FAST_HASH_FILL_STEP`, `skip_matching`
/// must leave index 1 of the skipped block findable: the follow-up block
/// `"bcdef"` has to match immediately at offset 15 (16-byte block, start
/// index 1) with no literals, and nothing may be emitted after that.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_skip_matching_seeds_every_position_even_with_fast_step() {
    let mut matcher = MatchGenerator::new(64);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;

    let skipped = b"abcdefghijklmnop".to_vec();
    matcher.add_data(skipped, SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();

    let probe = b"bcdef".to_vec();
    matcher.add_data(probe, SuffixStore::with_capacity(64), |_, _| {});

    let expected = Sequence::Triple {
        literals: b"",
        offset: 15,
        match_len: 5,
    };
    let first_emitted = matcher.next_sequence(|seq| assert_eq!(seq, expected));
    assert!(first_emitted);

    let second_emitted = matcher.next_sequence(|_| {});
    assert!(!second_emitted);
}
12093
/// `skip_matching_with_hint(Some(true))` is checked from the outside with two
/// probes against the same 32-byte block:
///
/// * a probe copied from index 3 must NOT match immediately (first emitted
///   sequence is plain literals);
/// * a probe copied from the last `MIN_MATCH_LEN` bytes MUST match
///   immediately (first emitted sequence is a triple with empty literals).
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    let block: &[u8] = b"abcdefghijklmnopqrstuvwxyz012345";
    let sparse_probe = block[3..3 + MIN_MATCH_LEN].to_vec();
    let tail_start = block.len() - MIN_MATCH_LEN;
    let tail_probe = block[tail_start..tail_start + MIN_MATCH_LEN].to_vec();

    // Observable behavior check: sparse-prefix probe should not immediately match.
    let mut matcher = MatchGenerator::new(128);
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(256), |_, _| {});
    matcher.skip_matching_with_hint(Some(true));
    matcher.add_data(sparse_probe, SuffixStore::with_capacity(256), |_, _| {});

    let mut sparse_first_is_literals: Option<bool> = None;
    let sparse_emitted = matcher.next_sequence(|seq| {
        // Only the first sequence is recorded; later ones keep the old value.
        let is_literals = matches!(seq, Sequence::Literals { .. });
        sparse_first_is_literals = sparse_first_is_literals.or(Some(is_literals));
    });
    assert!(sparse_emitted);
    assert!(
        sparse_first_is_literals.unwrap_or_default(),
        "sparse-start probe should not produce an immediate match"
    );

    // Dense tail remains indexed for cross-block boundary matching.
    let mut matcher = MatchGenerator::new(128);
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(256), |_, _| {});
    matcher.skip_matching_with_hint(Some(true));
    matcher.add_data(tail_probe, SuffixStore::with_capacity(256), |_, _| {});

    let mut tail_first_is_immediate_match: Option<bool> = None;
    let tail_emitted = matcher.next_sequence(|seq| {
        let is_immediate =
            matches!(seq, Sequence::Triple { literals, .. } if literals.is_empty());
        tail_first_is_immediate_match = tail_first_is_immediate_match.or(Some(is_immediate));
    });
    assert!(tail_emitted);
    assert!(
        tail_first_is_immediate_match.unwrap_or_default(),
        "dense tail probe should match immediately at block start"
    );
}
12140
/// After `add_suffixes_till(10, FAST_HASH_FILL_STEP)` on a 10-byte block,
/// the suffix store of that block must map the bytes at `5..10` to index 5.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_backfills_last_searchable_anchor() {
    let mut matcher = MatchGenerator::new(64);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;

    let input = b"01234abcde".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(64), |_, _| {});
    matcher.add_suffixes_till(10, FAST_HASH_FILL_STEP);

    let entry = matcher.window.last().unwrap();
    let anchor_key = &entry.data[5..10];
    let registered = entry.suffixes.get(anchor_key);
    assert_eq!(registered, Some(5));
}
12157
/// Calling `add_suffixes_till` with an index of `MIN_MATCH_LEN - 1` must not
/// register anything for the block's first `MIN_MATCH_LEN` bytes.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_skips_when_idx_below_min_match_len() {
    let mut matcher = MatchGenerator::new(128);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;

    let input = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(1 << 16), |_, _| {});

    let too_short = MIN_MATCH_LEN - 1;
    matcher.add_suffixes_till(too_short, FAST_HASH_FILL_STEP);

    let entry = matcher.window.last().unwrap();
    let leading_key = &entry.data[..MIN_MATCH_LEN];
    let registered = entry.suffixes.get(leading_key);
    assert_eq!(registered, None);
}
12175
/// After `add_suffixes_till(17, FAST_HASH_FILL_STEP)` the suffix store must
/// hold the `MIN_MATCH_LEN`-byte keys starting at indices 0, 3, 6, 9 and 12,
/// each mapped back to its own index.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_fast_step_registers_interleaved_positions() {
    let mut matcher = MatchGenerator::new(128);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;

    let input = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    matcher.add_data(input, SuffixStore::with_capacity(1 << 16), |_, _| {});
    matcher.add_suffixes_till(17, FAST_HASH_FILL_STEP);

    let entry = matcher.window.last().unwrap();
    // Every third index from 0 through 12 inclusive.
    for pos in (0usize..=12).step_by(3) {
        let key_end = pos + MIN_MATCH_LEN;
        let registered = entry.suffixes.get(&entry.data[pos..key_end]);
        assert_eq!(
            registered,
            Some(pos),
            "expected interleaved suffix registration at pos {pos}"
        );
    }
}
12199
/// Three 6-byte blocks go into a 16-byte window, with matching skipped for
/// the first two. The sequences emitted for the third block are replayed on
/// top of a copy of the second block; the replay must reproduce the second
/// and third blocks back to back.
#[test]
fn dfast_skip_matching_handles_window_eviction() {
    let mut matcher = DfastMatchGenerator::new(16);

    matcher.add_data(alloc::vec![1, 2, 3, 4, 5, 6], |_| {});
    matcher.skip_matching(None);
    matcher.add_data(alloc::vec![7, 8, 9, 10, 11, 12], |_| {});
    matcher.skip_matching(None);
    matcher.add_data(alloc::vec![7, 8, 9, 10, 11, 12], |_| {});

    // Decoder-side view: starts out holding the second block only.
    let mut rebuilt = alloc::vec![7, 8, 9, 10, 11, 12];
    matcher.start_matching(|seq| {
        // Split every sequence into its literal run and an optional copy.
        let (literals, copy) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };

        rebuilt.extend_from_slice(literals);
        if let Some((offset, match_len)) = copy {
            let copy_from = rebuilt.len() - offset;
            // Byte-by-byte so a match may overlap the bytes it is producing.
            for step in 0..match_len {
                let byte = rebuilt[copy_from + step];
                rebuilt.push(byte);
            }
        }
    });

    assert_eq!(rebuilt, [7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12]);
}
12229
/// Both input buffers are allocated with capacity 64 but hold only 8 bytes.
/// The buffer handed to the callback of the second `add_data` call (on an
/// 8-byte window) must report a `len()` of 8, not the 64-byte capacity.
#[test]
fn dfast_add_data_callback_reports_evicted_len_not_capacity() {
    // Builds a buffer whose capacity (64) deliberately exceeds its length.
    let roomy = |bytes: &[u8]| -> Vec<u8> {
        let mut buf = Vec::with_capacity(64);
        buf.extend_from_slice(bytes);
        buf
    };

    let mut matcher = DfastMatchGenerator::new(8);
    matcher.add_data(roomy(&b"abcdefgh"[..]), |_| {});

    let mut observed_evicted_len = None;
    matcher.add_data(roomy(&b"ijklmnop"[..]), |data| {
        observed_evicted_len = Some(data.len());
    });

    assert_eq!(
        observed_evicted_len,
        Some(8),
        "eviction callback must report evicted byte length, not backing capacity"
    );
}
12252
/// Driver-path regression for the `commit_space` Dfast-branch eviction
/// accounting bug (CodeRabbit Critical on PR #146).
///
/// Old code counted the INPUT buffer length as `evicted_bytes`
/// (`evicted_bytes += data.len()`), because Dfast's `add_data` callback
/// receives the input `Vec<u8>` for pool recycling (Dfast stores bytes in
/// `history`, not per-block Vecs). The fix derives eviction from the
/// `window_size` delta plus the input length:
/// `evicted = pre + space_len - post`.
///
/// On the saturated-window 1:1 path the two values coincide, so this test
/// forces the divergent case, and it goes through
/// `MatchGeneratorDriver::commit_space` itself (not just
/// `DfastMatchGenerator::add_data`) so that swapping the Dfast branch back
/// to the old accounting is caught.
///
/// Fixture: `max_window_size = 10`, commit sequence `[4, 4, 5]`:
///
///   * after commit `"abcd"` (4 B): window_blocks=[4], ws=4
///   * after commit `"efgh"` (4 B): window_blocks=[4,4], ws=8
///   * commit `"ijklm"` (5 B): 8+5>10 → pop front [4] (evict=4),
///     push 5 → window_blocks=[4,5], ws=9
///
/// `commit_space` then calls `retire_dictionary_budget(evicted)`, and the
/// `trim_after_budget_retire` cascade (which fires whenever
/// `retire_dictionary_budget` returns true) keeps trimming the now-oversize
/// window. Expected trace with starting budget = 100:
///
///   1st commit: evicted=0, no retire.
///   2nd commit: evicted=0, no retire.
///   3rd commit: evicted=4. retire(4) → budget=96, max_window=6.
///     trim_after_budget_retire:
///       iter1: ws=9 > max=6, pop [4] → ws=5, evicted=4.
///              retire(4) → budget=92, max_window=2.
///       iter2: ws=5 > max=2, pop [5] → ws=0, evicted=5.
///              retire(5) → budget=87, max_window=0.
///       iter3: ws=0, no trim, retire(0) → false, exit.
///
/// So 96 is only the intermediate value after the first retire; the values
/// asserted below are the final ones: budget = 87, max_window_size = 0.
/// With the old accounting the trace diverges (the original analysis
/// reports a panic on `data.len() <= max_window_size`), so the regression
/// surfaces as a test failure either way.
#[test]
fn dfast_commit_space_eviction_uses_window_size_delta() {
    use crate::encoding::CompressionLevel;

    // Builds an input buffer whose capacity (64) exceeds its length.
    let roomy = |bytes: &[u8]| -> Vec<u8> {
        let mut space = Vec::with_capacity(64);
        space.extend_from_slice(bytes);
        space
    };

    let mut driver = MatchGeneratorDriver::new(10, 1);
    driver.reset(CompressionLevel::Level(3));
    assert!(matches!(driver.storage, MatcherStorage::Dfast(_)));

    // Override the level-derived window with a tiny one so the
    // 4 + 4 + 5 = 13 commit sequence below actually crosses the
    // boundary. A 16 KiB+ default window would never evict on this
    // little data and the bug would stay invisible.
    driver.dfast_matcher_mut().max_window_size = 10;
    driver.dictionary_retained_budget = 100;

    // The first two commits fit inside the window, so the budget must not move.
    let non_evicting = [
        (
            b"abcd",
            "1st commit fills window 0 → 4, no eviction, no retire",
        ),
        (
            b"efgh",
            "2nd commit fills window 4 → 8, no eviction, no retire",
        ),
    ];
    for (chunk, why) in non_evicting {
        driver.commit_space(roomy(&chunk[..]));
        assert_eq!(driver.dictionary_retained_budget, 100, "{}", why);
    }

    // The third commit overflows the window and starts the retire cascade.
    driver.commit_space(roomy(&b"ijklm"[..]));
    assert_eq!(
        driver.dictionary_retained_budget, 87,
        "3rd commit + trim_after_budget_retire cascade. With the fix \
         (evicted=4 from window_size delta) the cascade reclaims 100 \
         → 96 → 92 → 87. With the bug (evicted=5 from data.len()) the \
         3rd commit would panic on `data.len() <= max_window_size` \
         after the 2nd commit's cascade had already shrunk \
         max_window_size to 0."
    );

    let final_window = driver.dfast_matcher_mut().max_window_size;
    assert_eq!(
        final_window,
        0,
        "cascade drains max_window_size to 0 once budget reclaim \
         exceeds the initial window size"
    );
}
12366
/// Fills a 16-byte window with two 8-byte blocks, shrinks `max_window_size`
/// to 8 and calls `trim_to_window`. Exactly one block must go:
/// `window_size` drops to 8, one entry stays in `window_blocks`, and
/// `history_abs_start` advances to 8.
#[test]
fn dfast_trim_to_window_evicts_oldest_block_by_length() {
    // After the history-only storage refactor (#111 Phase 7c step 3),
    // Dfast no longer retains input `Vec<u8>`s — the `history`
    // contiguous buffer is the sole byte store, and `add_data`
    // returns the input Vec to the caller's pool eagerly. So
    // `trim_to_window` has nothing to hand back to a closure; the
    // eviction is observable instead through `window_size` shrinking
    // by the per-block length recorded in `window_blocks`.
    let mut matcher = DfastMatchGenerator::new(16);

    for chunk in [b"abcdefgh", b"ijklmnop"] {
        let mut block = Vec::with_capacity(64);
        block.extend_from_slice(chunk);
        matcher.add_data(block, |_| {});
    }

    // Precondition: both blocks are resident before the trim.
    assert_eq!(matcher.window_size, 16);
    assert_eq!(matcher.window_blocks.len(), 2);

    matcher.max_window_size = 8;
    matcher.trim_to_window();

    // No callback to assert on: the Dfast variant of `trim_to_window`
    // takes none. That signature shape (vs HC/Row which accept
    // `impl FnMut(Vec<u8>)`) is the property locking in the contract —
    // nobody can "start invoking the callback" without a compile-time
    // signature break that the dispatcher and this test would force
    // the author to address.
    assert_eq!(
        matcher.window_size, 8,
        "exactly one 8-byte block must remain"
    );
    let remaining_blocks = matcher.window_blocks.len();
    assert_eq!(remaining_blocks, 1);
    assert_eq!(matcher.history_abs_start, 8);
}
12408
/// The first block ends in `"bcdea"` and the second block starts with the
/// same bytes. The second block must be emitted as exactly one sequence: a
/// triple with no literals, offset 5 and match length 11. Replaying it on
/// top of the first block must reproduce both blocks.
#[test]
fn dfast_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let mut history = Vec::new();

    // Block 1 has no internal repeats, so only literals may come out.
    matcher.add_data(b"012345bcdea".to_vec(), |_| {});
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            history.extend_from_slice(literals);
        } else {
            unreachable!("first block should not match history");
        }
    });
    assert_eq!(history, b"012345bcdea");

    // Block 2: count the emitted sequences so a second one is caught.
    matcher.add_data(b"bcdeabcdeab".to_vec(), |_| {});
    let mut emitted = 0usize;
    matcher.start_matching(|seq| {
        assert!(emitted == 0, "expected a single cross-block match");
        emitted += 1;

        if let Sequence::Triple {
            literals,
            offset,
            match_len,
        } = seq
        {
            assert_eq!(literals, b"");
            assert_eq!(offset, 5);
            assert_eq!(match_len, 11);

            // Replay the copy byte-by-byte; the match overlaps its own output.
            let copy_from = history.len() - offset;
            for step in 0..match_len {
                let byte = history[copy_from + step];
                history.push(byte);
            }
        } else {
            panic!("expected tail-anchored cross-block match before any literals")
        }
    });

    assert!(emitted > 0, "expected tail-anchored cross-block match");
    assert_eq!(history, b"012345bcdeabcdeabcdeab");
}
12453
/// Regression for #49 — pins `MatchTable::backfill_boundary_positions` on
/// the [`HcMatchGenerator`] lazy path.
///
/// `backfill_boundary_positions` seeds ONLY the last `< 4` bytes of the
/// previous slice (positions in `[current_abs_start - 3, current_abs_start)`),
/// i.e. the positions `insert_position` could not hash earlier because
/// hashing needs 4 bytes of lookahead. The 8 MiB window roundtrip test
/// covers cross-slice behaviour end-to-end but does not isolate those final
/// 1-3 bytes; this fixture does:
///
/// - Block 1 = `b"PQRSTBCD"` (8 bytes). Its `start_matching` pass hashes
///   positions 0..=4 (each has 4 bytes of forward context); positions
///   5/6/7 are the unhashable tail.
/// - Block 2 = `b"BCDBCDBCDB"` (10 bytes). At absolute position 8 (block 2
///   start) the 4-byte window is `b"BCDB"`. The only earlier position with
///   that window is 5 (`data[5..9] = b"BCD" + block_2[0]`), and it can only
///   have been inserted by the backfill on entry to block 2.
///
/// If the backfill regresses, position 5 is never hashed, the lookup at
/// position 8 misses, and the parser starts block 2 with literals — so
/// `offset == 3, match_len >= 4` would no longer hold.
#[test]
fn hashchain_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    matcher.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    matcher.table.add_data(b"PQRSTBCD".to_vec(), |_| {});
    let mut history = alloc::vec::Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            history.extend_from_slice(literals);
        } else {
            unreachable!("first block has no internal repeats");
        }
    });
    assert_eq!(history, b"PQRSTBCD");

    matcher.table.add_data(b"BCDBCDBCDB".to_vec(), |_| {});
    // (offset, match_len) of the first sequence emitted for block 2.
    let mut boundary_match: Option<(usize, usize)> = None;
    matcher.start_matching(|seq| {
        // Only the first sequence matters; ignore everything after it.
        if boundary_match.is_some() {
            return;
        }
        match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert_eq!(literals, b"", "no leading literals on the boundary match");
                boundary_match = Some((offset, match_len));
            }
            Sequence::Literals { .. } => {
                panic!(
                    "expected tail-anchored cross-block match before any literals — \
                     backfill_boundary_positions did not seed positions 5/6/7"
                )
            }
        }
    });

    let (offset, match_len) = boundary_match.expect(
        "expected tail-anchored cross-block match emitted from backfill_boundary_positions",
    );

    // Coarse check first (somewhere in the tail), then the exact position.
    let tail_offsets = 1..=3;
    assert!(
        tail_offsets.contains(&offset),
        "boundary match offset {offset} must point into the unhashable tail \
         (positions 5/6/7 of an 8-byte block 1) so the test specifically \
         locks down backfill_boundary_positions",
    );
    assert_eq!(
        offset, 3,
        "candidate position must land at 5 (= block_1_len - 3) so the 4-byte \
         window `data[5..9] = b\"BCDB\"` matches block 2's first hash lookup",
    );
    assert!(
        match_len >= HC_MIN_MATCH_LEN,
        "match_len {match_len} must clear the HC min-match floor",
    );
}
12539
/// A block ending in a 9-byte `tail` is committed and skipped with
/// `skip_matching(Some(false))`. The next block starts with the same `tail`,
/// so its first emitted sequence must be an immediate match (no literals)
/// at offset `tail.len()` with at least `DFAST_MIN_MATCH_LEN` bytes.
#[test]
fn dfast_dense_skip_matching_backfills_previous_tail_for_next_block() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";

    let mut first = b"0123456789abcdef".to_vec();
    first.extend_from_slice(tail);
    // `first` is not read again, so hand it over instead of cloning it.
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(false));

    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // (literal length, offset, match length) of the first sequence only;
    // a literals-only sequence is recorded with offset and length 0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_some() {
            return;
        }
        first_sequence = Some(match seq {
            Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals.len(), offset, match_len),
        });
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected dense skip to preserve cross-boundary tail match"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12583
/// Sparse-skip counterpart of the dense tail test: a 4096-byte
/// high-entropy block has its last 9 bytes overwritten with `tail` and is
/// skipped with `skip_matching(Some(true))`. The next block starts with the
/// same `tail`, so its first emitted sequence must still be an immediate
/// match (no literals) at offset `tail.len()` with at least
/// `DFAST_MIN_MATCH_LEN` bytes.
#[test]
fn dfast_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";

    let mut first = deterministic_high_entropy_bytes(0x9E37_79B9_7F4A_7C15, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    // `first` is not read again, so hand it over instead of cloning 4 KiB.
    matcher.add_data(first, |_| {});

    matcher.skip_matching(Some(true));

    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // (literal length, offset, match length) of the first sequence only;
    // a literals-only sequence is recorded with offset and length 0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_some() {
            return;
        }
        first_sequence = Some(match seq {
            Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals.len(), offset, match_len),
        });
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected match against densely seeded tail"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12629
/// After a 4096-byte block and a 1-byte block have both gone through
/// `skip_matching_dense`, the long-hash bucket for the 8 bytes starting at
/// `first_len - 7` must hold the packed slot of that position.
#[test]
fn dfast_skip_matching_dense_backfills_newly_hashable_long_tail_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    let first = deterministic_high_entropy_bytes(0x7A64_0315_D4E1_91C3, 4096);
    let first_len = first.len();
    matcher.add_data(first, |_| {});
    matcher.skip_matching_dense();

    // Appending one byte grows the history to `first_len + 1`, which gives
    // the start at `first_len - 7` the 8 bytes needed for long-hash
    // insertion (checked by the fixture assertion below).
    matcher.add_data(alloc::vec![0xAB], |_| {});
    matcher.skip_matching_dense();

    let boundary_abs = first_len - 7;
    let boundary_rel = boundary_abs - matcher.history_abs_start;
    let window = matcher.live_history();
    let hashed_end = boundary_rel + 8;
    assert!(
        hashed_end <= window.len(),
        "fixture must make the boundary start long-hashable"
    );

    let bucket = matcher.long_hash_index(&window[boundary_rel..]);
    let expected_slot = matcher.pack_slot(boundary_abs);
    // Single-slot tables (upstream zstd parity): the bucket holds at most one
    // u32, so the final check is a direct equality (no `.contains`).
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );
    assert_eq!(
        matcher.tables[bucket], expected_slot,
        "dense skip must seed long-hash entry for newly hashable boundary start"
    );
}
12663
/// Calls `seed_remaining_hashable_starts` on a 64-byte block with the
/// starting position `current_len - DFAST_MIN_MATCH_LEN` and checks that the
/// table entry at `long_len() + short_hash` for the start 5 bytes before the
/// block end holds that position's packed slot.
#[test]
fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 20);
    matcher.add_data(
        deterministic_high_entropy_bytes(0x13F0_9A6D_55CE_7B21, 64),
        |_| {},
    );
    matcher.ensure_hash_tables();

    // Length and absolute start of the newest block in the window.
    let current_len = matcher.window_blocks.back().map_or(0, |&len| len);
    let window_end = matcher.history_abs_start + matcher.window_size;
    let current_abs_start = window_end - current_len;

    let seed_start = current_len - DFAST_MIN_MATCH_LEN;
    matcher.seed_remaining_hashable_starts(current_abs_start, current_len, seed_start);

    // The start 5 bytes before the block end is the one probed below.
    let probe_abs = current_abs_start + current_len - 5;
    let probe_rel = probe_abs - matcher.history_abs_start;
    let window = matcher.live_history();
    assert!(
        probe_rel + 5 <= window.len(),
        "fixture must leave the last short-hash start valid"
    );

    let short_hash = matcher.short_hash_index(&window[probe_rel..]);
    let expected_slot = matcher.pack_slot(probe_abs);
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );

    let table_index = matcher.long_len() + short_hash;
    assert_eq!(
        matcher.tables[table_index],
        expected_slot,
        "tail seeding must include the last 5-byte-hashable start"
    );
}
12695
#[test]
fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() {
    let mut dfast = DfastMatchGenerator::new(1 << 20);
    let payload = deterministic_high_entropy_bytes(0x7BB2_DA91_441E_C0EF, 64);
    dfast.add_data(payload, |_| {});
    dfast.ensure_hash_tables();

    let block_len = dfast.window_blocks.back().map_or(0, |&len| len);
    let block_abs_start = dfast.history_abs_start + dfast.window_size - block_len;

    // Edge case under test: the starting position handed to the seeder is
    // already the block end (third argument == block length).
    dfast.seed_remaining_hashable_starts(block_abs_start, block_len, block_len);

    // Even then, the final 5-byte-hashable start must land in the short table.
    let last_short_abs = block_abs_start + block_len - 5;
    let last_short_rel = last_short_abs - dfast.history_abs_start;

    let history = dfast.live_history();
    assert!(
        history.len() >= last_short_rel + 5,
        "fixture must leave the last short-hash start valid"
    );
    let short_hash = dfast.short_hash_index(&history[last_short_rel..]);

    let expected_slot = dfast.pack_slot(last_short_abs);
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );

    let short_bucket = dfast.long_len() + short_hash;
    assert_eq!(
        dfast.tables[short_bucket], expected_slot,
        "tail seeding must still include the last 5-byte-hashable start when pos is at block end"
    );
}
12726
/// Rebase contract of `ensure_room_for`: asking for room at an absolute
/// position whose relative offset would exceed
/// `u32::MAX - DFAST_REBASE_GUARD_BAND` must trigger `reduce()`.
///
/// Three things are checked after the rebase:
/// 1. `position_base` advanced by exactly `DFAST_REBASE_GUARD_BAND`;
/// 2. an entry seeded at a much smaller absolute position (below the
///    reducer) was cleared to `DFAST_EMPTY_SLOT` in both the long-hash
///    and the short-hash table;
/// 3. packing the trigger position yields a non-sentinel slot that
///    resolves back to the same absolute position.
///
/// Mirrors `LdmHashTable::ensure_room_for_*` from PR #139.
///
/// The test is not gated on pointer width: the trigger position
/// `u32::MAX - DFAST_REBASE_GUARD_BAND + 1 = 0xC0000000` fits in a
/// 32-bit `usize` (i686), so the packed-slot boundary path and the
/// u32 ↔ usize round-trip run on every target we ship.
#[test]
fn dfast_ensure_room_for_rebases_above_guard_band() {
    let guard_band = DFAST_REBASE_GUARD_BAND as usize;

    let mut dfast = DfastMatchGenerator::new(1 << 22);
    dfast.set_hash_bits(10, 10);
    dfast.ensure_hash_tables();

    // Plant one stale entry (abs = 1024) in the first bucket of BOTH
    // tables. `reduce` has to sweep long and short hashes alike; seeding
    // only one would let a regression that skips the other slip through.
    // Writing the packed slot straight into the buckets keeps the test on
    // the rebase mechanics, without the history/window setup that
    // `insert_position` would require.
    let stale_slot = dfast.pack_slot(1024usize);
    assert_ne!(stale_slot, DFAST_EMPTY_SLOT);
    let short_base = dfast.long_len();
    for bucket in [short_base, 0] {
        dfast.tables[bucket] = stale_slot;
    }

    // With `position_base == 0`, this is the smallest absolute position
    // that fails the `rel <= max_rel` check and therefore forces the first
    // rebase; one `reduce(DFAST_REBASE_GUARD_BAND)` moves the base forward
    // by the guard band.
    let trigger_abs = u32::MAX as usize - guard_band + 1;
    assert_eq!(dfast.position_base, 0);
    dfast.ensure_room_for(trigger_abs);
    assert_eq!(
        dfast.position_base, guard_band,
        "rebase must advance position_base by DFAST_REBASE_GUARD_BAND"
    );

    // The stale entry was packed as 1025. The rebase subtracts the guard
    // band (2^30) from every slot, and 1025 <= 2^30, so the slot clamps to
    // the empty sentinel — the same clamp-at-zero rule as upstream zstd's
    // `ZSTD_window_reduce`. Both tables are checked.
    let short_after = dfast.tables[dfast.long_len()];
    assert_eq!(
        short_after, DFAST_EMPTY_SLOT,
        "pre-rebase short-hash entries below the reducer must become empty"
    );
    let long_after = dfast.tables[0];
    assert_eq!(
        long_after, DFAST_EMPTY_SLOT,
        "pre-rebase long-hash entries below the reducer must become empty"
    );

    // Round-trip at the boundary: pack to a real slot, then undo the
    // packing by hand as `position_base + slot - 1`.
    let fresh_slot = dfast.pack_slot(trigger_abs);
    assert_ne!(fresh_slot, DFAST_EMPTY_SLOT);
    let round_tripped = dfast.position_base + (fresh_slot as usize) - 1;
    assert_eq!(
        round_tripped, trigger_abs,
        "post-rebase pack/unpack must round-trip the absolute position"
    );
}
12801
#[test]
fn dfast_sparse_skip_matching_backfills_previous_tail_for_consecutive_sparse_blocks() {
    // An 11-byte marker split across the first→second block boundary:
    // three bytes close the first block, eight bytes open the second.
    let boundary_prefix: [u8; 3] = [0xFA, 0xFB, 0xFC];
    let boundary_suffix: [u8; 8] = [0xFD, 0xEE, 0xAD, 0xBE, 0xEF, 0x11, 0x22, 0x33];

    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // Block 1: high-entropy bytes ending in the marker prefix; skipped sparsely.
    let mut first = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    let prefix_at = first.len() - boundary_prefix.len();
    first[prefix_at..].copy_from_slice(&boundary_prefix);
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Block 2: begins with the marker suffix; also skipped sparsely.
    let mut second = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    second[..boundary_suffix.len()].copy_from_slice(&boundary_suffix);
    let second_len = second.len();
    matcher.add_data(second, |_| {});
    matcher.skip_matching(Some(true));

    // Block 3: the whole marker followed by filler, matched for real.
    let mut third = Vec::with_capacity(boundary_prefix.len() + boundary_suffix.len() + 18);
    third.extend_from_slice(&boundary_prefix);
    third.extend_from_slice(&boundary_suffix);
    third.extend_from_slice(b"-trailing-literals");
    matcher.add_data(third, |_| {});

    // Record only the first emitted sequence as (literal_len, offset, match_len).
    let mut first_seen: Option<(usize, usize, usize)> = None;
    matcher.start_matching(|seq| {
        if first_seen.is_none() {
            let summary = match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            };
            first_seen = Some(summary);
        }
    });

    let (lit_len, offset, match_len) = first_seen.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate match from the prior sparse-skip boundary"
    );
    // The match source begins at the last three bytes of block 1, i.e. one
    // full second block plus the prefix length behind the current position.
    assert_eq!(
        offset,
        second_len + boundary_prefix.len(),
        "expected match against backfilled first→second boundary start"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12854
#[test]
fn fastest_hint_iteration_23_sequences_reconstruct_source() {
    /// Deterministic pseudo-random bytes: a 64-bit LCG advanced once per
    /// byte, emitting the state shifted right by 33 bits (truncated to u8).
    fn generate_data(seed: u64, len: usize) -> Vec<u8> {
        let mut state = seed;
        (0..len)
            .map(|_| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (state >> 33) as u8
            })
            .collect()
    }

    let i = 23u64;
    let len = (i * 89 % 16384) as usize;
    let mut data = generate_data(i, len);
    // Append a repeated slice so the input contains an obvious repeat for
    // the matcher to find (see the note near the end on why a match is not
    // asserted).
    let repeat = data[128..256].to_vec();
    data.extend_from_slice(&repeat);
    data.extend_from_slice(&repeat);

    let mut driver = MatchGeneratorDriver::new(1024 * 128, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Fastest);
    let mut space = driver.get_next_space();
    space[..data.len()].copy_from_slice(&data);
    space.truncate(data.len());
    driver.commit_space(space);

    // Replay every emitted sequence the way a decoder would.
    let mut rebuilt = Vec::with_capacity(data.len());
    driver.start_matching(|seq| match seq {
        Sequence::Literals { literals } => rebuilt.extend_from_slice(literals),
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            rebuilt.extend_from_slice(literals);
            assert!(offset > 0, "offset must be non-zero");
            assert!(
                offset <= rebuilt.len(),
                "offset must reference already-produced bytes: offset={} produced={}",
                offset,
                rebuilt.len()
            );
            // Byte-at-a-time copy on purpose: a match may overlap the bytes
            // it is producing (offset < match_len), so each pushed byte has
            // to be readable by the following iterations.
            let start = rebuilt.len() - offset;
            for idx in 0..match_len {
                let b = rebuilt[start + idx];
                rebuilt.push(b);
            }
        }
    });

    // No assertion that a `Sequence::Triple` was emitted: whether this
    // specific input produces one depends on the matcher's step-skip
    // schedule (the upstream zstd-shape kernel walks ip0 with
    // kSearchStrength-driven stride growth and may step over the repeat
    // region once the stride has grown). The legacy SuffixStore-based
    // matcher visited every position and always hit short repeats, but that
    // was a tuning preference, not a correctness invariant. The substance
    // of this test is the exact reconstruction below.
    assert_eq!(rebuilt, data);
}
12924
#[test]
fn fast_levels_dispatch_per_level_hash_log_and_mls() {
    // Level 1 — upstream zstd `{ 19, 13, 14, 1, 7, 0, ZSTD_fast }` row:
    // hash_log=14, mls=7. targetLength=0 gives step_size 2.
    let f1 = resolve_level_params(CompressionLevel::Level(1), None)
        .fast
        .expect("Level(1) must carry fast params");
    assert_eq!(f1.hash_log, 14, "Level(1) fast_hash_log");
    assert_eq!(f1.mls, 7, "Level(1) fast_mls");
    assert_eq!(f1.step_size, 2, "Level(1) fast_step_size");

    // Negative levels share one base row, pinned here as hash_log=13 and
    // mls=7. A 2^13-entry table of u32 slots is 32 KiB (L1d-sized, versus
    // 64 KiB for hash_log=14), and mls=7 drops short-distance 6-byte
    // matches.
    // NOTE(review): upstream's "base for negative levels" row lists
    // minMatch=6 — confirm mls=7 is the intended value for this crate.
    // step_size follows upstream zstd's formula: targetLength = -level,
    // step_size = (-level) + 1, giving 2..8 for L-1..L-7.
    for n in -7..=-1 {
        let f = resolve_level_params(CompressionLevel::Level(n), None)
            .fast
            .expect("negative levels must carry fast params");
        assert_eq!(f.hash_log, 13, "Level({n}) fast_hash_log");
        assert_eq!(f.mls, 7, "Level({n}) fast_mls");
        let expected_step = ((-n) as usize) + 1;
        assert_eq!(f.step_size, expected_step, "Level({n}) fast_step_size");
    }

    // Fastest + Uncompressed keep hash_log=14 / mls=6 / step_size=2 (their
    // own tuning; not part of the negative-level ladder above).
    let pf = resolve_level_params(CompressionLevel::Fastest, None);
    let ff = pf.fast.expect("Fastest must carry fast params");
    assert_eq!(
        (pf.window_log, ff.hash_log, ff.mls, ff.step_size),
        (19, 14, 6, 2),
        "Fastest (window_log, hash_log, mls, step_size)",
    );
    // Uncompressed keeps window_log=17 (no history references, smaller
    // decoder reservation); its fast cParams are the same (14, 6, 2) tuple
    // as Fastest — NOT the negative-base (13, 7) row.
    let pu = resolve_level_params(CompressionLevel::Uncompressed, None);
    let fu = pu.fast.expect("Uncompressed must carry fast params");
    assert_eq!(
        (pu.window_log, fu.hash_log, fu.mls, fu.step_size),
        (17, 14, 6, 2),
        "Uncompressed (window_log, hash_log, mls, step_size)",
    );
}
12970
/// End-to-end wiring check for the Fast levels: after resetting a
/// `MatchGeneratorDriver` to each level, the inner `FastKernelMatcher`
/// must report the same `(hash_log, mls, step_size)` that
/// `resolve_level_params` computes for that level. This catches plumbing
/// mistakes — swapped arguments, a step_size left over from a previous
/// frame, values stuck on their defaults — which a test that only looks
/// at the resolved parameters cannot see.
#[test]
fn fast_levels_driver_wiring_threads_cparams_into_inner_matcher() {
    let mut driver = MatchGeneratorDriver::new(64 * 1024, 1);

    // Level 1, Fastest, Uncompressed, then Level(-1) down to Level(-7).
    let named_levels = [
        CompressionLevel::Level(1),
        CompressionLevel::Fastest,
        CompressionLevel::Uncompressed,
    ];
    let negative_levels = (1..=7).map(|n| CompressionLevel::Level(-n));

    for level in named_levels.iter().copied().chain(negative_levels) {
        let resolved = resolve_level_params(level, None);
        // Guard the test's own premise: each listed level has to resolve
        // to a Fast-strategy row, or the checks below test something else.
        assert_eq!(
            resolved.strategy_tag,
            super::strategy::StrategyTag::Fast,
            "{level:?} must resolve to Fast strategy",
        );

        // Detour through a non-Fast strategy before the reset under test.
        // `Default` resolves to the Dfast strategy, so the following reset
        // has to take the backend-switch path (`MatchGeneratorDriver::new`
        // / `simple_mut` recreate the Fast variant via
        // `FastKernelMatcher::with_params`). Without the detour the loop
        // would stay in `BackendTag::Simple` and only ever exercise
        // `FastKernelMatcher::reset`, leaving the `with_params` wiring
        // untested on the production path.
        crate::encoding::Matcher::reset(&mut driver, CompressionLevel::Default);

        // The production reset path, as driven by FrameCompressor /
        // StreamingEncoder.
        crate::encoding::Matcher::reset(&mut driver, level);

        let want = resolved.fast.unwrap();
        let inner = driver.simple_mut();
        assert_eq!(
            inner.hash_log(),
            want.hash_log,
            "{level:?}: inner matcher hash_log mismatch — argument swap?",
        );
        assert_eq!(
            inner.mls(),
            want.mls,
            "{level:?}: inner matcher mls mismatch — argument swap?",
        );
        assert_eq!(
            inner.step_size(),
            want.step_size,
            "{level:?}: inner matcher step_size mismatch — stale value carried from prior reset?",
        );
    }
}
13040
/// Locks the lazy-band `target_len` for levels 5-15 to the reference
/// `cParams.targetLength` column of `clevels.h` table[0] (the default
/// table, `srcSize > 256 KB`). In the reference's lazy outer loop
/// `targetLength` acts as `sufficient_len`: the "nice match" length at
/// which the chain walk stops as soon as a candidate reaches it.
///
/// The reference runs levels 13-15 on btlazy2 while this crate runs them
/// on the hash-chain Lazy parser; the reference `targetLength` (32) is the
/// same nice-match threshold for both finders, so it is mirrored as-is.
///
/// The expected values are transcribed inline from the table, keeping this
/// a pure-Rust in-tree test with no FFI dependency.
#[test]
fn lazy_band_target_len_matches_default_table() {
    // table[0] (srcSize > 256 KB) targetLength for levels 5, 6, ..., 15,
    // in level order.
    let target_lengths: [usize; 11] = [2, 4, 8, 16, 16, 16, 16, 32, 32, 32, 32];

    for (level, want) in (5i32..=15).zip(target_lengths) {
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        // L5 = greedy (Row backend → `row`); L6-15 = lazy (HashChain → `hc`).
        // Prefer the hash-chain config and fall back to the row config.
        let target_len = match (params.hc, params.row) {
            (Some(hc), _) => hc.target_len,
            (None, Some(row)) => row.target_len,
            (None, None) => panic!("lazy/greedy level carries hc or row config"),
        };
        assert_eq!(target_len, want, "L{level}: target_len must match table[0]");
    }
}
13081
/// Pins the full parameter row for levels 13-15 to the reference btlazy2
/// budget in `clevels.h` table[0]: `window_log` / `hash_log` / `chain_log`
/// equal the reference `windowLog` / `hashLog` / `chainLog`, and
/// `search_depth == 1 << cParams.searchLog` (16 / 32 / 64).
///
/// These levels run on the hash-chain Lazy parser rather than a
/// binary-tree finder, so they do not re-establish a strict ratio ladder
/// above L12 on window-fitting inputs. Checking every field, not only
/// `search_depth`, keeps the whole budget aligned and guards each field
/// against silent drift.
#[test]
fn upper_lazy_band_params_match_default_table() {
    // table[0] (srcSize > 256 KB), levels 13..=15 (btlazy2 budget):
    // (level, windowLog, hashLog, chainLog, searchLog).
    let reference_rows: [(i32, u8, usize, usize, u32); 3] = [
        (13, 22, 22, 22, 4),
        (14, 22, 23, 22, 5),
        (15, 22, 23, 23, 6),
    ];

    for (level, window_log, hash_log, chain_log, search_log) in reference_rows {
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        let hc = params.hc.unwrap();

        // The search budget is stored as a depth, i.e. 2^searchLog.
        let search_depth = 1usize << search_log;
        assert_eq!(hc.search_depth, search_depth, "L{level}: search_depth");
        assert_eq!(params.window_log, window_log, "L{level}: window_log");
        assert_eq!(hc.hash_log, hash_log, "L{level}: hash_log");
        assert_eq!(hc.chain_log, chain_log, "L{level}: chain_log");
    }
}
structured_zstd/encoding/match_generator.rs

structured_zstd/encoding/
match_generator.rs