structured_zstd/encoding/
match_generator.rs

//! Matching algorithm used to find repeated parts in the original data
2//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
4//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
5//!
6//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
7
8use alloc::vec::Vec;
9// SIMD/CRC intrinsics now live in `crate::encoding::fastpath::*` where they
10// sit under per-CPU `#[target_feature]` umbrellas; no architecture-specific
11// intrinsic imports remain in this file.
12use super::CompressionLevel;
13use super::Matcher;
14use super::Sequence;
15use super::blocks::encode_offset_with_history;
16use super::bt::BtMatcher;
17#[cfg(test)]
18use super::cost_model::HC_MAX_LIT;
19use super::cost_model::{
20    HC_BITCOST_MULTIPLIER, HC_FORMAT_MINMATCH, HC_OPT_NODE_LEN, HC_OPT_NUM, HC_OPT_PRICE_ARENA_LEN,
21    HC_OPT_PRICE_STRIDE, HC_PREDEF_THRESHOLD, HcOptState, HcOptimalCostProfile,
22};
23#[cfg(test)]
24use super::cost_model::{HC_BLOCKSIZE_MAX, HC_MAX_LL, HC_MAX_ML, HC_MAX_OFF, HcOptPriceType};
25use super::dfast::DfastMatchGenerator;
26// FAST_HASH_FILL_STEP test-only re-export was tied to the legacy
27// SuffixStore MatchGenerator's interleaved hash-fill stride. The
28// upstream zstd-shape Fast kernel walks ip0 with kSearchStrength step-skip
29// acceleration instead, so the constant has no consumer in the
30// remaining live test set today.
31#[cfg(test)]
32use super::match_table::helpers::INCOMPRESSIBLE_SKIP_STEP;
33use super::match_table::helpers::MIN_MATCH_LEN;
34#[cfg(test)]
35use super::match_table::helpers::common_prefix_len;
36#[cfg(test)]
37use super::opt::ldm::HcRawSeq;
38use super::opt::ldm::{HcOptLdmState, HcRawSeqStore};
39use super::opt::types::{
40    HcCandidateQuery, HcOptimalNode, HcOptimalPlanBuffers, HcOptimalPlanState, HcOptimalSequence,
41    MatchCandidate,
42};
43use super::row::RowMatchGenerator;
44use super::simple::fast_matcher::{FAST_LEVEL_1_HASH_LOG, FAST_LEVEL_1_MLS, FastKernelMatcher};
45#[cfg(all(
46    test,
47    feature = "std",
48    target_arch = "aarch64",
49    target_endian = "little"
50))]
51use std::arch::is_aarch64_feature_detected;
52#[cfg(all(test, feature = "std", target_arch = "x86_64"))]
53use std::arch::is_x86_feature_detected;
54
/// Minimum match length of the Double-Fast matcher, in bytes. The
/// `DfastConfig` docs in this file describe it as the upstream zstd-fixed
/// `minMatch` (5) for that backend.
pub(crate) const DFAST_MIN_MATCH_LEN: usize = 5;
// Bytes the dfast short hash reads (upstream zstd `mls = 5`). Seeding / lookahead
// guards use it so a position is only short-hashed once its full 5-byte key
// is in range.
pub(crate) const DFAST_SHORT_HASH_LOOKAHEAD: usize = 5;
/// Default acceptance floor (`RowConfig::mls`) for the row matcher, in bytes.
/// Every `ROW_*` config row in this file uses it unless a `min_match`
/// override replaces it.
pub(crate) const ROW_MIN_MATCH_LEN: usize = 5;
61// Upstream zstd `clevels.h:31` at level 3 large-input bucket sets
62// `hashLog = 17` (the long-hash table) and `chainLog = 16` (the
63// short-hash table — upstream zstd names this `chainTable` even though for
64// dfast it's used as a plain single-slot hash). Each table holds one
65// `U32` per slot; the upstream zstd overwrites on collision and recovers
66// compression quality via the inline `_search_next_long` retry
67// (after a short-hash hit, probes `hashLong[hl1]` at `ip + 1` and
68// keeps the longer match).
69//
70// We mirror that storage layout: single `u32` per bucket (no
71// `[u32; N]` array), `long_hash` sized `1 << DFAST_HASH_BITS` and
72// `short_hash` one bit smaller via `DFAST_SHORT_HASH_BITS_DELTA`.
73// Two-table footprint at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`,
74// exact upstream parity. The `_search_next_long` retry lives in
75// `DfastMatchGenerator::hash_candidate` (called via
76// `best_match`). Earlier revisions kept a
77// 4-slot bucket per hash position; that paid 4× the upstream zstd memory
78// without measurable ratio gain once the retry was in place.
79//
80// `dfast_hash_bits_for_window` still clamps the runtime long-hash
81// value to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the
82// upper bound rather than a fixed default.
pub(crate) const DFAST_HASH_BITS: usize = 17;
/// Difference between `long_hash_bits` and `short_hash_bits` —
/// upstream zstd `hashLog - chainLog` is 1 at every dfast level (`clevels.h`
/// level 2: 16-15=1; level 3: 17-16=1). The short hash is one bit
/// smaller than the long hash so the per-bucket footprint matches
/// upstream zstd sizing exactly.
///
/// NOTE(review): `DFAST_L4` in this file carries equal widths (18 / 18), so
/// the "delta is 1 at every dfast level" statement does not hold for every
/// per-level `DfastConfig`. Confirm which call sites still derive the short
/// width from this delta rather than from `DfastConfig::short_hash_log`.
pub(crate) const DFAST_SHORT_HASH_BITS_DELTA: usize = 1;
/// Sentinel value for an empty slot in the dfast hash tables. Real
/// positions are stored as `(abs_pos - position_base + 1) as u32`, so
/// `0` is reserved as the "empty" marker and a true relative offset
/// of `0` never appears in the table. Mirrors the LDM table's
/// `LdmEntry.offset == 0` convention (see `encoding/ldm/table.rs`)
/// so both rebasing structures share one sentinel scheme.
pub(crate) const DFAST_EMPTY_SLOT: u32 = 0;
98
/// Guard band reserved above the high-water mark before triggering a
/// rebase on the Dfast hash tables. When the next insert would push a
/// relative offset above `u32::MAX - DFAST_REBASE_GUARD_BAND`, the
/// table calls `reduce(GUARD_BAND)` to shift every slot down and
/// advance `position_base` so future inserts stay inside the `u32`
/// window. Same scheme as `encoding/ldm/table.rs`.
pub(crate) const DFAST_REBASE_GUARD_BAND: u32 = 1u32 << 30;
// Dfast step-skip tuning. NOTE(review): the consumers are outside this view;
// the per-constant notes below are inferred from the names and values only.
/// Log2 of the skip-growth interval (see `DFAST_SKIP_STEP_GROWTH_INTERVAL`).
pub(crate) const DFAST_SKIP_SEARCH_STRENGTH: usize = 6;
/// `1 << DFAST_SKIP_SEARCH_STRENGTH` (= 64); presumably the number of
/// consecutive misses after which the skip step grows — confirm at the use site.
pub(crate) const DFAST_SKIP_STEP_GROWTH_INTERVAL: usize = 1 << DFAST_SKIP_SEARCH_STRENGTH;
/// Presumably the upper bound on the regular skip step — confirm at the use site.
pub(crate) const DFAST_MAX_SKIP_STEP: usize = 8;
/// Presumably the skip step used once data is judged incompressible — confirm
/// at the use site.
pub(crate) const DFAST_INCOMPRESSIBLE_SKIP_STEP: usize = 16;
// Row matcher defaults. `ROW_HASH_BITS`, `ROW_LOG`, `ROW_SEARCH_DEPTH` and
// `ROW_TARGET_LEN` populate the test-only `ROW_CONFIG` (`hash_bits`, `row_log`,
// `search_depth`, `target_len` respectively).
pub(crate) const ROW_HASH_BITS: usize = 20;
pub(crate) const ROW_LOG: usize = 5;
pub(crate) const ROW_SEARCH_DEPTH: usize = 16;
pub(crate) const ROW_TARGET_LEN: usize = 48;
/// Presumably the width in bits of the per-slot row tag — confirm in
/// `encoding::row`.
pub(crate) const ROW_TAG_BITS: usize = 8;
/// Marker for an unused row slot (`u32::MAX`). Note this differs from the
/// dfast tables, which reserve `0` (`DFAST_EMPTY_SLOT`).
pub(crate) const ROW_EMPTY_SLOT: u32 = u32::MAX;
/// Row hash key width in bytes; the `RowConfig::mls` docs state it stays at 4
/// regardless of the acceptance floor.
pub(crate) const ROW_HASH_KEY_LEN: usize = 4;
117// HASH_MIX_PRIME now lives in `crate::encoding::fastpath::scalar`; the four
118// per-CPU `hash_mix_u64` variants share it via that module.
119// HC_PRIME3BYTES / HC_PRIME4BYTES moved to match_table::storage
120// alongside the hash helpers in Phase 1e Stage A. Only the test
121// module references the constants directly (production code goes
122// through `MatchTable::hash_value_with_mls`).
123#[cfg(test)]
124use super::match_table::storage::{HC_PRIME3BYTES, HC_PRIME4BYTES};
125
126// HC_HASH_LOG / HC_CHAIN_LOG / HC3_HASH_LOG / HC_EMPTY live on the
127// shared storage module so MatchTable methods can reference them
128// without pulling in this module. Re-imported here so existing
129// macros / configs / tests keep their unqualified names.
130#[cfg(test)]
131use super::match_table::storage::HC_EMPTY;
132use super::match_table::storage::HC3_HASH_LOG;
133// HC_HASH_LOG / HC_CHAIN_LOG feed the test-only `HC_CONFIG` default.
134#[cfg(test)]
135use super::match_table::storage::{HC_CHAIN_LOG, HC_HASH_LOG};
136// HC3_MAX_OFFSET moved to encoding::bt alongside the hash3 candidate
137// probe macro that consumes it; the macro references it via the
138// fully-qualified `$crate::encoding::bt::HC3_MAX_OFFSET` path so this
139// module no longer needs a local import.
/// Default hash-chain search depth; supplies `search_depth` for
/// `HC_OVERRIDE_DEFAULT` and the test-only `HC_CONFIG`.
const HC_SEARCH_DEPTH: usize = 16;
// HC_MIN_MATCH_LEN moved to encoding::hc; re-imported here so
// existing references compile unchanged.
use super::hc::HC_MIN_MATCH_LEN;
/// Alias of the cost model's `HC_FORMAT_MINMATCH`; presumably the minimum
/// match length used by the optimal-parse path — confirm at the use site.
const HC_OPT_MIN_MATCH_LEN: usize = HC_FORMAT_MINMATCH;
/// Default hash-chain target length; supplies `target_len` for
/// `HC_OVERRIDE_DEFAULT` and the test-only `HC_CONFIG`.
const HC_TARGET_LEN: usize = 48;
146
147// MAX_HC_SEARCH_DEPTH moved to encoding::hc alongside chain_candidates.
148use super::hc::MAX_HC_SEARCH_DEPTH;
149
150// `Strategy` and `StrategyTag` live in `crate::encoding::strategy`.
151// The driver carries a `StrategyTag` field set at `reset()` and
152// dispatches each block into a monomorphised `compress_block::<S>`
153// per concrete strategy.
154
/// Bundled tuning knobs for the hash-chain matcher. Using a typed config
/// instead of positional `usize` args eliminates parameter-order hazards.
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
struct HcConfig {
    /// Hash table width as a log2 (fed by the `hash_log` parameter override).
    hash_log: usize,
    /// Chain table width as a log2 (fed by the `chain_log` parameter override).
    chain_log: usize,
    /// Number of candidates examined per search; a `search_log` override
    /// sets it to `1 << search_log`.
    search_depth: usize,
    /// Target match length (fed by the `target_length` parameter override).
    target_len: usize,
    /// Binary-tree finder hash width (upstream zstd `mls = BOUNDED(4, minMatch, 6)`),
    /// carried explicitly per level so it is NOT inferred from `target_len`
    /// (a `target_length` override must not silently flip the finder between
    /// 5- and 4-byte hashing). Only the BT body reads it; HC/lazy levels keep
    /// it at 4 (their `hash_position` is always 4-byte). 5 for the
    /// minMatch=5 BT levels (btlazy2 + btopt L16), 4 elsewhere.
    search_mls: usize,
}
171
/// Bundled tuning knobs for the row matcher.
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub(crate) struct RowConfig {
    /// Row hash-table width as a log2 (carries the upstream zstd `hashLog`).
    pub(crate) hash_bits: usize,
    /// Upstream zstd `rowLog = clamp(searchLog, 4, 6)`.
    pub(crate) row_log: usize,
    /// Upstream zstd `nbAttempts = 1 << min(searchLog, rowLog)`.
    pub(crate) search_depth: usize,
    /// Upstream zstd `targetLength` (nice-match early-out).
    pub(crate) target_len: usize,
    /// Upstream zstd `cParams.minMatch` for the row matcher: the regular-search
    /// acceptance floor (a row candidate must extend to >= `mls` bytes).
    /// The C-like advanced API surfaces this as the row min-match knob.
    /// `ROW_MIN_MATCH_LEN` (5) is the default; the row hash key width stays
    /// 4 bytes (an internal detail), so this only tunes the acceptance
    /// floor, not the candidate hash distribution.
    pub(crate) mls: usize,
}
186
// Only used as the default HashChain config when the test-only parse×search
// override pairs a level with a backend its native row doesn't populate.
// Field values match the non-test `HC_OVERRIDE_DEFAULT` (same storage
// constants, same depth / target length / 4-byte finder hash).
#[cfg(test)]
const HC_CONFIG: HcConfig = HcConfig {
    hash_log: HC_HASH_LOG,
    chain_log: HC_CHAIN_LOG,
    search_depth: HC_SEARCH_DEPTH,
    target_len: HC_TARGET_LEN,
    search_mls: 4,
};
197
/// Base HashChain config synthesized when a public-parameter strategy
/// override ([`super::parameters`]) routes a level to the HC / BT
/// backend whose native level row didn't populate `hc` (e.g. forcing
/// `Strategy::Lazy2` onto a level the table resolves to Fast). Mirrors
/// the mid-band lazy defaults; the per-knob overrides then refine it.
///
/// The storage constants are spelled with their full path because the
/// unqualified `HC_HASH_LOG` / `HC_CHAIN_LOG` imports in this file are
/// `#[cfg(test)]`-only, while this constant is also needed in non-test builds.
const HC_OVERRIDE_DEFAULT: HcConfig = HcConfig {
    hash_log: super::match_table::storage::HC_HASH_LOG,
    chain_log: super::match_table::storage::HC_CHAIN_LOG,
    search_depth: HC_SEARCH_DEPTH,
    target_len: HC_TARGET_LEN,
    // 4-byte finder hash; `apply_param_overrides` substitutes 5 when the
    // synthesized row is for a `Btlazy2` strategy.
    search_mls: 4,
};
210
// HashChain/BT configs for the btultra2 band.
// NOTE(review): the level table that selects between these is outside this
// view. The `_L22` / `_256K` / `_128K` / `_16K` suffixes presumably name the
// level-22 row and its source-size buckets (the `source_size_ceil_log` docs
// mention "the L22 config buckets") — confirm against the selection code.

/// Base btultra2 config: 2^24 hash and chain tables, depth 512, target 256.
const BTULTRA2_HC_CONFIG: HcConfig = HcConfig {
    hash_log: 24,
    chain_log: 24,
    search_depth: 512,
    target_len: 256,
    search_mls: 4,
};

/// Level-22 config: 2^25 hash table, 2^27 chain table, depth 512, target 999.
const BTULTRA2_HC_CONFIG_L22: HcConfig = HcConfig {
    hash_log: 25,
    chain_log: 27,
    search_depth: 512,
    target_len: 999,
    search_mls: 4,
};

/// Level-22 variant with 2^19 tables and depth `1 << 13`; presumably the
/// <= 256 KiB source bucket.
const BTULTRA2_HC_CONFIG_L22_256K: HcConfig = HcConfig {
    hash_log: 19,
    chain_log: 19,
    search_depth: 1 << 13,
    target_len: 999,
    search_mls: 4,
};

/// Level-22 variant with a 2^17 hash / 2^18 chain table and depth `1 << 11`;
/// presumably the <= 128 KiB source bucket.
const BTULTRA2_HC_CONFIG_L22_128K: HcConfig = HcConfig {
    hash_log: 17,
    chain_log: 18,
    search_depth: 1 << 11,
    target_len: 999,
    search_mls: 4,
};

/// Level-22 variant with 2^15 tables and depth `1 << 10`; presumably the
/// <= 16 KiB source bucket.
const BTULTRA2_HC_CONFIG_L22_16K: HcConfig = HcConfig {
    hash_log: 15,
    chain_log: 15,
    search_depth: 1 << 10,
    target_len: 999,
    search_mls: 4,
};
250
// Default Row config: only used by tests and the test-only parse×search
// override (production greedy L5 carries its own `ROW_L5`).
// Built entirely from the shared `ROW_*` default constants
// (hash_bits = 20, row_log = 5, search_depth = 16, target_len = 48, mls = 5).
#[cfg(test)]
const ROW_CONFIG: RowConfig = RowConfig {
    hash_bits: ROW_HASH_BITS,
    row_log: ROW_LOG,
    search_depth: ROW_SEARCH_DEPTH,
    target_len: ROW_TARGET_LEN,
    mls: ROW_MIN_MATCH_LEN,
};
261
// Level-5 greedy is the ONLY strategy routed to the Row backend
// (`StrategyTag::backend`: greedy -> Row; lazy / btopt / btultra* ->
// HashChain), so it is the only level whose `row:` field is read. The upstream zstd
// `clevels.h` default row (srcSize > 256 KB) for level 5 is searchLog=3,
// targetLength=2, from which the row matcher derives:
//   rowLog       = clamp(searchLog, 4, 6) = 4
//   search_depth = 1 << min(searchLog, rowLog) = 8   (= nbAttempts)
//   target_len   = targetLength = 2                  (nice-match early-out)
// The shared `ROW_CONFIG` (row_log=5, search_depth=16, target_len=48) ran a
// level-12-grade search here: 16 slots per row, never early-exiting until a
// 48-byte match. That exhaustive walk was the dominant cost in greedy L5's
// encode-speed regression vs FFI. `hash_bits` matches upstream zstd's
// `ZSTD_getCParams(5, .., 0).hashLog` = 19 (verified via
// `cparams_check 5`), so the row table is the same width as upstream's
// (2^19 slots); the previous `ROW_HASH_BITS` (20) doubled both row tables vs
// upstream, the dominant peak-memory excess on the greedy band.
//
// NOTE(review): the "ONLY strategy routed to the Row backend" statement above
// looks stale — this file also defines `ROW_L6`..`ROW_L12` for the lazy band
// and documents `ROW_ATTACH_DICT_CUTOFF_LOG` as the "Row (greedy/lazy)"
// cutoff. Confirm against the level table and update whichever is wrong.
//
// `ROW_L5` is also the row synthesized by `apply_param_overrides` when a
// strategy override selects `SearchMethod::RowHash` on a level without a
// native `row` config.
const ROW_L5: RowConfig = RowConfig {
    hash_bits: 19,
    row_log: 4,
    search_depth: 8,
    target_len: 2,
    mls: ROW_MIN_MATCH_LEN,
};
285
286// Upstream zstd `clevels.h` unbounded defaults for the lazy band, verified via
287// `ZSTD_getCParams(level, 0, 0)`:
288//   L6  { w21 c18 h19 s3 mml5 t4  lazy  } → rowLog 4, depth 1<<3 = 8
289//   L7  { w21 c19 h20 s4 mml5 t8  lazy  } → rowLog 4, depth 16
290//   L8  { w21 c19 h20 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
291//   L9  { w22 c20 h21 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
292//   L10 { w22 c21 h22 s5 mml5 t16 lazy2 } → rowLog 5, depth 32
293//   L11 { w22 c21 h22 s6 mml5 t16 lazy2 } → rowLog 6, depth 64
294//   L12 { w22 c22 h23 s6 mml5 t32 lazy2 } → rowLog 6, depth 64
295// `rowLog = clamp(searchLog, 4, 6)`, `depth = 1 << min(searchLog, rowLog)`
296// (same derivation as `ROW_L5` above). `hash_bits` carries the upstream zstd
297// `hashLog`; the hinted-source clamp in `configure` caps it by the window
298// exactly like the upstream zstd `ZSTD_adjustCParams` path.
// L6 (lazy): hashLog 19, searchLog 3 -> rowLog 4, depth 8; targetLength 4.
const ROW_L6: RowConfig = RowConfig {
    hash_bits: 19,
    row_log: 4,
    search_depth: 8,
    target_len: 4,
    mls: ROW_MIN_MATCH_LEN,
};
// L7 (lazy): hashLog 20, searchLog 4 -> rowLog 4, depth 16; targetLength 8.
const ROW_L7: RowConfig = RowConfig {
    hash_bits: 20,
    row_log: 4,
    search_depth: 16,
    target_len: 8,
    mls: ROW_MIN_MATCH_LEN,
};
// L8 (lazy2): hashLog 20, searchLog 4 -> rowLog 4, depth 16; targetLength 16.
const ROW_L8: RowConfig = RowConfig {
    hash_bits: 20,
    row_log: 4,
    search_depth: 16,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L9 (lazy2): hashLog 21, searchLog 4 -> rowLog 4, depth 16; targetLength 16.
const ROW_L9: RowConfig = RowConfig {
    hash_bits: 21,
    row_log: 4,
    search_depth: 16,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L10 (lazy2): hashLog 22, searchLog 5 -> rowLog 5, depth 32; targetLength 16.
const ROW_L10: RowConfig = RowConfig {
    hash_bits: 22,
    row_log: 5,
    search_depth: 32,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L11 (lazy2): hashLog 22, searchLog 6 -> rowLog 6, depth 64; targetLength 16.
const ROW_L11: RowConfig = RowConfig {
    hash_bits: 22,
    row_log: 6,
    search_depth: 64,
    target_len: 16,
    mls: ROW_MIN_MATCH_LEN,
};
// L12 (lazy2): hashLog 23, searchLog 6 -> rowLog 6, depth 64; targetLength 32.
const ROW_L12: RowConfig = RowConfig {
    hash_bits: 23,
    row_log: 6,
    search_depth: 64,
    target_len: 32,
    mls: ROW_MIN_MATCH_LEN,
};
348
/// Per-level Double-Fast hash sizing, mirroring the upstream zstd `clevels.h` columns
/// (config-driven, not a hardcoded constant): `long_hash_log` =
/// `cParams.hashLog` (the long 8-byte hash table), `short_hash_log` =
/// `cParams.chainLog` (the short hash table dfast repurposes as its
/// secondary index). Only the Dfast backend reads it, so non-dfast level
/// rows carry `dfast: None`. `minMatch` stays the upstream zstd-fixed `5`
/// (`DFAST_MIN_MATCH_LEN`, used in const contexts).
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
struct DfastConfig {
    /// Log2 width of the long hash table (upstream `hashLog`).
    long_hash_log: u8,
    /// Log2 width of the short hash table (upstream `chainLog`).
    short_hash_log: u8,
}
361
// Upstream zstd clevels.h default row (srcSize > 256 KB): L3 {hashLog 17, chainLog 16},
// L4 {hashLog 18, chainLog 18}.
// `DFAST_L3` is also the row synthesized by `apply_param_overrides` when a
// strategy override selects `SearchMethod::DoubleFast` on a level without a
// native `dfast` config.
const DFAST_L3: DfastConfig = DfastConfig {
    long_hash_log: 17,
    short_hash_log: 16,
};
// Level 4: long and short tables share the same width (no one-bit delta).
const DFAST_L4: DfastConfig = DfastConfig {
    long_hash_log: 18,
    short_hash_log: 18,
};
372
/// Per-level Fast-strategy tuning, only consumed by the `FastKernelMatcher`
/// (Simple backend): `hash_log` = upstream zstd `cParams.hashLog`, `mls` = upstream zstd
/// `cParams.minMatch` (4..=8), `step_size` = upstream zstd `stepSize`. Carried as
/// `LevelParams.fast` (`Some` only on Fast level rows; `None` elsewhere).
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
struct FastConfig {
    /// Log2 width of the hash table (upstream `hashLog`).
    hash_log: u32,
    /// Minimum match length in bytes (upstream `minMatch`).
    mls: u32,
    /// Upstream `stepSize`.
    step_size: usize,
}
383
// Level-1 Fast row. Also the row synthesized by `apply_param_overrides` when a
// strategy override selects `SearchMethod::Fast` on a level without a native
// `fast` config.
const FAST_L1: FastConfig = FastConfig {
    hash_log: 14,
    // Tier-0 (srcSize > 256 KiB) `cParams.minMatch`. Upstream zstd selects the
    // Level-1 row from a 4-way srcSize-tiered table (`ZSTD_getCParams_internal`
    // → `ZSTD_defaultCParameters[tableID][1]`), and minMatch shrinks for
    // smaller inputs: 7 (>256 KiB) / 6 (16..256 KiB) / 5 (<=16 KiB). The base
    // here is the tier-0 value; `fast_l1_mls_for_source_size` lowers it per the
    // tier in `adjust_params_for_source_size`.
    mls: 7,
    step_size: 2,
};
// Presumably the Level-2 Fast row (per its name): wider hash table (2^16) and
// a shorter minimum match (6) than `FAST_L1`, same step size.
const FAST_L2: FastConfig = FastConfig {
    hash_log: 16,
    mls: 6,
    step_size: 2,
};
400
/// Resolved tuning parameters for a compression level. The
/// [`StrategyTag`] is the single source of truth for the backend
/// family and the compile-time strategy consts; the runtime
/// [`BackendTag`] used by the driver dispatcher is derived via
/// [`StrategyTag::backend`] so the two cannot drift.
///
/// NOTE(review): `LevelParams::backend` in this file derives the backend from
/// the decoupled `search` field (`self.search.backend()`), not from
/// `strategy_tag`; the sentence above may predate that decoupling.
#[derive(Copy, Clone, PartialEq, Eq)]
struct LevelParams {
    /// Strategy family for this level; read by `pre_split` and replaced by a
    /// strategy parameter override.
    strategy_tag: super::strategy::StrategyTag,
    /// Decoupled search-method axis. Independent of `strategy_tag`'s
    /// parse half: a level can pair any parse (greedy / lazy depth via
    /// `lazy_depth`) with any search backend here. Defaults to the
    /// historical pairing (`strategy_tag.search()`) but is overridable
    /// per level so the parse×search matrix can be swept and tuned.
    search: super::strategy::SearchMethod,
    /// Window size as a log2; replaced by a `window_log` parameter override.
    window_log: u8,
    /// Lazy-parse depth; `>= 2` is treated as lazy2 by `pre_split` and
    /// `ldm_strategy_ordinal`.
    lazy_depth: u8,
    /// Per-strategy tuning. Exactly one is `Some` on each level row, matching
    /// `strategy_tag`'s backend, so the table self-documents which knobs a
    /// level actually consumes (the others are `None`, not dead placeholders):
    /// `fast` for the Fast/Simple backend, `dfast` for Double-Fast, `hc` for
    /// the HashChain (lazy / btopt / btultra*) backend, `row` for the Row
    /// (greedy L5) backend.
    ///
    /// The "exactly one" invariant holds for the level table only: after
    /// `apply_param_overrides` re-routes `search`, the config for the new
    /// backend is inserted while the previously populated one is left in
    /// place, so more than one field can be `Some`.
    fast: Option<FastConfig>,
    dfast: Option<DfastConfig>,
    hc: Option<HcConfig>,
    row: Option<RowConfig>,
}
428
429impl LevelParams {
430    /// Backend family (storage variant) for the driver dispatcher.
431    /// Derived from the decoupled `search` axis so a level can route to
432    /// a different search backend than its `strategy_tag` historically
433    /// implied.
434    fn backend(&self) -> super::strategy::BackendTag {
435        self.search.backend()
436    }
437
438    /// Parse mode derived from the decoupled `search` axis: the binary-tree
439    /// search path carries `ParseMode::Optimal`; every other search backend
440    /// derives greedy/lazy/lazy2 from `lazy_depth`. Reading `search` (not the
441    /// strategy tag) keeps the parse×search decoupling complete even when a
442    /// level whose tag is `Bt*` is overridden to a non-BT search backend.
443    fn parse(&self) -> super::strategy::ParseMode {
444        match self.search {
445            super::strategy::SearchMethod::BinaryTree => super::strategy::ParseMode::Optimal,
446            _ => super::strategy::ParseMode::from_lazy_depth(self.lazy_depth),
447        }
448    }
449
450    /// Cheap fingerprint pre-splitter level (the C-like `blockSplitterLevel`):
451    /// the EFFECTIVE upstream `ZSTD_splitBlock` level that
452    /// `ZSTD_optimalBlockSize` dispatches, i.e. `splitLevels[strategy] - 2`
453    /// (clamped at 0), NOT the raw `splitLevels[]` value. `split_level == 0`
454    /// routes to the cheap from-borders heuristic; `1..=4` to byChunks with
455    /// internal sampling level `split_level - 1`. See the body for the
456    /// per-strategy tier table and why the raw-table mapping was wrong.
457    fn pre_split(&self) -> Option<u8> {
458        use super::strategy::StrategyTag;
459        // Effective upstream `ZSTD_splitBlock` level = `splitLevels[strat] - 2`
460        // (clamped at 0). Upstream `splitLevels[] = {0,0,1,2,2,3,3,4,4,4}` then
461        // subtracts 2 before dispatch, so the byChunks sampling tier is two
462        // steps coarser than the raw table: greedy/lazy(d1)=0 (from-borders),
463        // lazy2/btlazy2=1 (byChunks rate 43), btopt+=2 (byChunks rate 11).
464        // An earlier version mirrored the RAW table AND bumped lazy2 to the
465        // rate-1 full scan (split 4) to dodge a periodic-input phantom-split —
466        // that ran the pre-splitter at up to 43x upstream's sampling cost
467        // (~87% of L9 encode time on the decode corpus). Per the drop-in
468        // contract ratio only needs to stay <= upstream, so matching upstream's
469        // sampling tier (and accepting upstream's identical over-split on
470        // periodic input) is the dominant large-input encode-speed win.
471        Some(match self.strategy_tag {
472            // splitLevels 0/1 -> 0: upstream does not pre-split fast/dfast at
473            // all; from-borders is the cheapest stand-in and rarely splits.
474            StrategyTag::Fast | StrategyTag::Dfast => 0,
475            // greedy / lazy(depth 1): splitLevels 2 -> 0 (from-borders).
476            StrategyTag::Greedy => 0,
477            StrategyTag::Lazy => {
478                if self.lazy_depth >= 2 {
479                    1 // lazy2: splitLevels 3 -> 1 (byChunks rate 43)
480                } else {
481                    0 // lazy depth 1: splitLevels 2 -> 0 (from-borders)
482                }
483            }
484            StrategyTag::Btlazy2 => 1, // splitLevels 3 -> 1 (byChunks rate 43)
485            StrategyTag::BtOpt | StrategyTag::BtUltra | StrategyTag::BtUltra2 => 2,
486        })
487    }
488}
489
490/// Apply the public-parameter per-knob overrides (#27) onto the
491/// level-resolved [`LevelParams`], in place. Runs in [`Matcher::reset`]
492/// after the level params are computed and before backend selection, so
493/// a strategy override re-routes the backend uniformly. An all-`None`
494/// override is a no-op the caller skips via
495/// [`super::parameters::ParamOverrides::is_empty`], keeping the default
496/// level geometry byte-identical.
497fn apply_param_overrides(params: &mut LevelParams, ov: &super::parameters::ParamOverrides) {
498    use super::strategy::SearchMethod;
499
500    // 1. Strategy override re-derives tag / search / lazy depth.
501    if let Some(strategy) = ov.strategy {
502        let tag = strategy.tag();
503        params.strategy_tag = tag;
504        params.search = tag.search();
505        params.lazy_depth = strategy.lazy_depth();
506    }
507
508    // 2. Ensure the active backend's config row exists (synthesize a
509    //    default when a strategy override moved off the native row).
510    match params.search {
511        SearchMethod::Fast => {
512            params.fast.get_or_insert(FAST_L1);
513        }
514        SearchMethod::DoubleFast => {
515            params.dfast.get_or_insert(DFAST_L3);
516        }
517        SearchMethod::RowHash => {
518            params.row.get_or_insert(ROW_L5);
519        }
520        SearchMethod::HashChain | SearchMethod::BinaryTree => {
521            // A `Btlazy2` strategy override moved off a non-HC row needs the
522            // BT 5-byte finder hash (upstream zstd minMatch 5); other synthesized HC
523            // rows keep the 4-byte default. An explicit `min_match` override
524            // below refines this further.
525            params.hc.get_or_insert(HcConfig {
526                search_mls: if matches!(params.strategy_tag, super::strategy::StrategyTag::Btlazy2)
527                {
528                    5
529                } else {
530                    HC_OVERRIDE_DEFAULT.search_mls
531                },
532                ..HC_OVERRIDE_DEFAULT
533            });
534        }
535    }
536
537    // 3. window_log (bounds-checked at <= 30 by the builder).
538    if let Some(window_log) = ov.window_log {
539        params.window_log = window_log;
540    }
541
542    // 4. Per-backend numeric knobs map into the active config, mirroring
543    //    the upstream zstd `cParams` -> matcher translation documented on each
544    //    config struct.
545    match params.search {
546        SearchMethod::Fast => {
547            if let Some(fast) = params.fast.as_mut() {
548                if let Some(hash_log) = ov.hash_log {
549                    fast.hash_log = hash_log;
550                }
551                if let Some(min_match) = ov.min_match {
552                    fast.mls = min_match;
553                }
554            }
555        }
556        SearchMethod::DoubleFast => {
557            if let Some(dfast) = params.dfast.as_mut() {
558                // hashLog -> long table, chainLog -> short table (the
559                // dfast secondary index). Both bounds-checked <= 30, so
560                // the `u8` casts are lossless.
561                if let Some(hash_log) = ov.hash_log {
562                    dfast.long_hash_log = hash_log as u8;
563                }
564                if let Some(chain_log) = ov.chain_log {
565                    dfast.short_hash_log = chain_log as u8;
566                }
567            }
568        }
569        SearchMethod::RowHash => {
570            if let Some(row) = params.row.as_mut() {
571                // Row hash-table width override (mirrors dfast `long_hash_log`
572                // / hc `hash_log`). Row has no separate chain table — the
573                // per-row depth comes from `search_log` below — so only
574                // `hash_log` maps here; `chain_log` has no Row analogue.
575                if let Some(hash_log) = ov.hash_log {
576                    row.hash_bits = hash_log as usize;
577                }
578                if let Some(search_log) = ov.search_log {
579                    // Upstream zstd: rowLog = clamp(searchLog, 4, 6);
580                    //        nbAttempts = 1 << min(searchLog, rowLog).
581                    let row_log = (search_log as usize).clamp(4, 6);
582                    row.row_log = row_log;
583                    row.search_depth = 1usize << (search_log as usize).min(row_log);
584                }
585                if let Some(target_length) = ov.target_length {
586                    row.target_len = target_length as usize;
587                }
588                if let Some(min_match) = ov.min_match {
589                    row.mls = min_match as usize;
590                }
591            }
592        }
593        SearchMethod::HashChain | SearchMethod::BinaryTree => {
594            if let Some(hc) = params.hc.as_mut() {
595                if let Some(hash_log) = ov.hash_log {
596                    hc.hash_log = hash_log as usize;
597                }
598                if let Some(chain_log) = ov.chain_log {
599                    hc.chain_log = chain_log as usize;
600                }
601                if let Some(search_log) = ov.search_log {
602                    hc.search_depth = 1usize << search_log;
603                }
604                if let Some(target_length) = ov.target_length {
605                    hc.target_len = target_length as usize;
606                }
607                if let Some(min_match) = ov.min_match {
608                    // Upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`: a BT
609                    // min_match override maps into the finder hash width. Only
610                    // the BT body reads `search_mls`; HC/lazy keep 4-byte
611                    // hashing regardless, so this is a no-op for them.
612                    hc.search_mls = (min_match as usize).clamp(4, 6);
613                }
614            }
615        }
616    }
617}
618
/// Translate the resolved runtime strategy into the upstream zstd LDM
/// strategy ordinal (1..=9) consumed by
/// [`super::ldm::params::LdmParams::adjust_for`].
///
/// The collapsed `Lazy` tag is disambiguated by `lazy_depth`: depth `>= 2`
/// is lazy2 (ordinal 5), anything lower is lazy (ordinal 4). `lazy_depth` is
/// ignored for every other tag.
#[cfg(feature = "hash")]
fn ldm_strategy_ordinal(tag: super::strategy::StrategyTag, lazy_depth: u8) -> u32 {
    use super::strategy::StrategyTag;

    let is_lazy2 = lazy_depth >= 2;
    match (tag, is_lazy2) {
        (StrategyTag::Fast, _) => 1,
        (StrategyTag::Dfast, _) => 2,
        (StrategyTag::Greedy, _) => 3,
        (StrategyTag::Lazy, false) => 4,
        (StrategyTag::Lazy, true) => 5,
        // Upstream zstd `ZSTD_btlazy2` ordinal.
        (StrategyTag::Btlazy2, _) => 6,
        (StrategyTag::BtOpt, _) => 7,
        (StrategyTag::BtUltra, _) => 8,
        (StrategyTag::BtUltra2, _) => 9,
    }
}
643
644/// `ceil(log2(size))` of a source-size hint, with a zero hint floored to
645/// [`MIN_WINDOW_LOG`]. This is the single quantization every hint-dependent
646/// matcher parameter is derived from: the window-log cap, the HC / Fast hash
647/// and chain widths, the Dfast / Row table widths, the L22 config buckets, and
648/// the Fast attach-vs-copy cutoff. Two hints sharing this value resolve to the
649/// identical matcher shape, which is why it (not the raw byte count) keys the
650/// primed-dictionary snapshot — see [`PrimedKey`]. Operates on the full `u64`
651/// so callers comparing a hint against a cutoff get the same bucketed decision
652/// here and at the driver, with no `as usize` truncation on 32-bit targets.
653pub(crate) fn source_size_ceil_log(size: u64) -> u8 {
654    if size == 0 {
655        MIN_WINDOW_LOG
656    } else {
657        (64 - (size - 1).leading_zeros()) as u8
658    }
659}
660
/// Attach-vs-copy cutoff for the Fast strategy, as a ceil-log bucket: a hint at
/// or below `2^this` (or unknown, `None`) ATTACHES the dictionary (a separate
/// immutable table scanned in place via the borrowed dual-base kernel); a larger
/// hint would COPY it into the live table. The bucket is the value produced by
/// [`source_size_ceil_log`].
///
/// We set this to `31` so every dictionary source up to 2 GiB attaches,
/// diverging from upstream zstd's 8 KiB `ZSTD_shouldAttachDict` cutoff ON
/// PURPOSE: upstream copy mode copies the small CDict TABLES into the cctx and
/// still scans the input in place, but our flat-history copy path memmoves the
/// whole INPUT into history every frame (profiled at 30% `__memmove` + 14%
/// `__memset` on a reused 1 MiB dict encode). Attach mode scans the caller's
/// input in place with the dict as a separate prefix base, so it is strictly
/// faster for every frame size here (measured: 1 MiB dict frame 167 us -> 52 us,
/// 0.42x of C; 10 KiB 20.4 us -> 4.4 us, 0.17x of C). The dual-base kernel
/// carries `window_low`, so over-window inputs stay in-window and C-decodable.
///
/// `31` is also the largest bucket the borrowed kernel can attach: it stores
/// virtual positions as `u32` (`cur_abs as u32`), so the maximum attached source
/// `1 << 31` (plus the dict prefix) stays below `u32::MAX`; the next bucket `32`
/// (4 GiB) would wrap that arithmetic. Sources past 2 GiB therefore fall back to
/// copy mode — rare in practice, and the relative copy cost shrinks as the
/// source grows. Per the drop-in-not-binary-parity contract, we make this match
/// decision ourselves.
///
/// Shared by `reset` (records the mode in the primed-snapshot key) and
/// `prime_with_dictionary` (acts on it).
pub(crate) const FAST_ATTACH_DICT_CUTOFF_LOG: u8 = 31;
687
688/// Largest dictionary region (bytes) the Fast attach path can index. The tagged
689/// dict table packs each position into `32 - DICT_TAG_BITS` (= 24) bits, so a
690/// region past `2^24` (16 MiB) would overflow the packed position. Dictionaries
691/// this large fall back to COPY mode, whose live table stores full `u32`
692/// positions and handles them. The size hint set on dict load equals the actual
693/// dict content length, so the attach-vs-copy decision (and the matching
694/// snapshot-key / epoch bits) can gate on it consistently at reset time.
695pub(crate) const MAX_FAST_ATTACH_DICT_REGION: usize = 1 << 24;
696
697/// Dfast counterpart of [`FAST_ATTACH_DICT_CUTOFF_LOG`]: upstream zstd
698/// `ZSTD_dictMatchState` attach cutoff for the double-fast strategy is 16 KiB
699/// (`2^14`), so small / unknown-size inputs ATTACH (separate immutable dict
700/// long+short tables + dual-probe in `start_matching_fast_loop`) and larger
701/// known-size inputs COPY (re-prime the dict into the live tables, where the
702/// dense scan matches it as window history). The attach build also self-gates
703/// on `use_fast_loop` inside `skip_matching_for_dict_attach` — only the
704/// fast-loop levels (L3 / Default / L0) carry the dual-probe.
705const DFAST_ATTACH_DICT_CUTOFF_LOG: u8 = 14;
706
707/// `ZSTD_dictMatchState` attach cutoff for the Row (greedy/lazy) strategy is
708/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs`): small / unknown-size inputs
709/// ATTACH the dict into the separate immutable row index (bounded dual-probe in
710/// `row_candidate_rl`), larger known-size inputs dense-COPY into the live rows.
711const ROW_ATTACH_DICT_CUTOFF_LOG: u8 = 15;
712
/// `ZSTD_dictMatchState` attach cutoff for the HashChain (lazy2) strategy is
/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs[ZSTD_lazy2]`): small /
/// unknown-size inputs ATTACH the dict as a separate hash-chain dms (the dual
/// search in `find_best_match` walks the live input chain + the dms), larger
/// known-size inputs dense-COPY (merge the dict into the live chain and search
/// the one combined chain).
718const HC_ATTACH_DICT_CUTOFF_LOG: u8 = 15;
719
720/// BT/optimal attach cutoff for `btlazy2` + `btopt`: 32 KiB (`2^15`, upstream
721/// zstd `attachDictSizeCutoffs[ZSTD_btlazy2]` == `[ZSTD_btopt]`). Small /
722/// unknown-size inputs ATTACH the dict as a separate DUBT dms; larger known-size
723/// inputs COPY the dict into the LIVE binary tree (upstream zstd
724/// `ZSTD_resetCCtx_byCopyingCDict`).
725const BT_OPT_ATTACH_DICT_CUTOFF_LOG: u8 = 15;
726
727/// BT/optimal attach cutoff for `btultra` + `btultra2`: 8 KiB (`2^13`, upstream
728/// zstd `attachDictSizeCutoffs[ZSTD_btultra]` == `[ZSTD_btultra2]`). The deepest
729/// parses copy the dict into the live tree past a much smaller source than the
730/// `btopt` tier, matching upstream's per-strategy cutoff table.
731const BT_ULTRA_ATTACH_DICT_CUTOFF_LOG: u8 = 13;
732
733// Source-size cap for the dfast hash bits when a size hint is present: a tiny
734// input needs no larger hash than its window. The upstream zstd `cParams.hashLog` /
735// `chainLog` (from `DfastConfig`) caps it from above at the call site.
736fn dfast_hash_bits_for_window(max_window_size: usize) -> usize {
737    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
738    window_log.max(MIN_WINDOW_LOG as usize)
739}
740
741fn row_hash_bits_for_window(max_window_size: usize) -> usize {
742    // Upstream zstd `ZSTD_adjustCParams_internal` cap: `hashLog <= windowLog + 1`.
743    // The `+ 1` is load-bearing for L12, whose upstream zstd hashLog (23) exceeds
744    // its windowLog (22) — a plain `windowLog` cap would shrink the L12
745    // table on EVERY hinted reset and split primed snapshots between
746    // hinted and unhinted frames that resolve to the identical geometry.
747    // No constant upper clamp: the old `ROW_HASH_BITS` (20) ceiling
748    // predates the lazy band moving onto Row (L9-12 carry upstream zstd hashLog
749    // 21-23).
750    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
751    (window_log + 1).max(MIN_WINDOW_LOG as usize)
752}
753
754/// `floor(log2(window))` for the HashChain table-log cap (upstream zstd
755/// `ZSTD_adjustCParams_internal`). The caller clamps the level's `hash_log` /
756/// `chain_log` from above with this so a small hinted input doesn't allocate the
757/// full level's tables.
758fn hc_hash_bits_for_window(max_window_size: usize) -> usize {
759    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
760    window_log.max(MIN_WINDOW_LOG as usize)
761}
762
763/// Parameter table for numeric compression levels 1–22.
764///
765/// Each entry maps a zstd compression level to the best-available matcher
766/// backend and tuning knobs. High levels map to dedicated parse modes:
767/// btopt (16-17), btultra (18), btultra2 (19-22) — matching upstream zstd
768/// `clevels.h` (level 19 is `ZSTD_btultra2`, not plain btultra).
769///
770/// Index 0 = level 1, index 21 = level 22.
771#[rustfmt::skip]
772const LEVEL_TABLE: [LevelParams; 22] = [
773    // Exactly one of fast/dfast/hc/row is Some per row, matching the strategy
774    // backend; the rest are None (not dead placeholders).
775    // Lvl  Strategy       wlog  lazy  per-strategy config
776    // ---  -------------- ----  ----  -------------------
777    /* 1 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 19, lazy_depth: 0, fast: Some(FAST_L1), dfast: None, hc: None, row: None },
778    /* 2 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 20, lazy_depth: 0, fast: Some(FAST_L2), dfast: None, hc: None, row: None },
779    /* 3 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L3), hc: None, row: None },
780    /* 4 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L4), hc: None, row: None },
781    // target_len column for L5..=L15 matches upstream zstd cParams.targetLength
782    // from clevels.h table[0] (default — srcSize > 256 KB). Upstream zstd uses
783    // it as the lazy outer loop's `sufficient_len` (nice-match) threshold.
784    // Inflating it above upstream zstd forces the chain walk to complete
785    // search_depth iterations instead of breaking on the first
786    // long-enough match — the dominant cost in the L5..=L15 speed
787    // regression vs FFI (see lazy_band_target_len_matches_default_table).
788    /* 5 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Greedy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 0, fast: None, dfast: None, hc: None, row: Some(ROW_L5) },
789    // L6-12: the upstream zstd runs the lazy/lazy2 strategies on the ROW-based
790    // match finder by default (`ZSTD_resolveRowMatchFinderMode`: row mode
791    // is on for greedy..lazy2 whenever SIMD is available) — a bounded
792    // SIMD tag scan per row instead of a pointer-chasing hash-chain walk.
793    // Our HashChain walk on these levels was ~75% of L10 wall time on the
794    // 1 MiB corpus (dependent chain-table loads). Same `RowConfig`
795    // derivation as `ROW_L5` above, upstream zstd values per level in the
796    // `ROW_L6..ROW_L12` comment block.
797    /* 6 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L6) },
798    /* 7 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L7) },
799    /* 8 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L8) },
800    /* 9 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L9) },
801    /*10 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L10) },
802    /*11 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L11) },
803    /*12 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L12) },
    // L13-15: reference uses btlazy2 (binary-tree finder) with searchLog 4/5/6
    // (search_depth 16/32/64) and targetLength 32. We run the hash-chain Lazy
    // parser here, so we mirror the reference search budget rather than inflate
    // it: matching the table keeps speed near the reference and makes per-level
    // perf divergences comparable. The binary-tree finder that would let a
    // smaller searchLog find longer matches (and re-establish a strict ratio
    // ladder above L12) is tracked separately; until it lands these levels sit
    // close to L12 on hash-chain inputs by design.
    // NOTE(review): the rows below are tagged `StrategyTag::Btlazy2` with
    // `SearchMethod::BinaryTree`, which does not match the "hash-chain Lazy
    // parser" wording above — confirm whether this comment predates the
    // binary-tree finder landing and update it accordingly.
812    /*13 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 16, target_len: 32, search_mls: 5 }), row: None },
813    /*14 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 22, search_depth: 32, target_len: 32, search_mls: 5 }), row: None },
814    /*15 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 23, search_depth: 64, target_len: 32, search_mls: 5 }), row: None },
815    /*16 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 32, target_len: 48, search_mls: 5 }), row: None },
816    /*17 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 32, target_len: 64, search_mls: 4 }), row: None },
817    /*18 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 64, target_len: 64, search_mls: 4 }), row: None },
818    /*19 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 24, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
819    /*20 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 25, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 25, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
820    /*21 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 26, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG), row: None },
821    /*22 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 27, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG_L22), row: None },
822];
823
824/// Upstream `ZSTD_createCDict` table geometry: the `(hash_log, chain_log)` a
825/// dictionary's prepared match-finder tables get. Thin adapter over the single
826/// cParams source [`super::cparams::create_cdict_table_logs`], which mirrors
827/// `ZSTD_adjustCParams_internal` under `ZSTD_cpm_createCDict`. `window_log` is
828/// the resolved compress window; `hash_log` / `chain_log` are the level's own
829/// widths; `uses_bt` selects the binary-tree `cycleLog` (`chainLog - 1`).
830fn cdict_table_logs(
831    window_log: u8,
832    hash_log: usize,
833    chain_log: usize,
834    uses_bt: bool,
835    dict_size: usize,
836) -> (usize, usize) {
837    let (h, c) = super::cparams::create_cdict_table_logs(
838        window_log,
839        hash_log as u32,
840        chain_log as u32,
841        uses_bt,
842        dict_size,
843    );
844    (h as usize, c as usize)
845}
846
847/// Smallest window_log the encoder will use regardless of source size.
848pub(crate) const MIN_WINDOW_LOG: u8 = 10;
849/// Conservative floor for source-size-hinted window tuning.
850///
851/// Hinted windows below 16 KiB (`window_log < 14`) currently regress C-FFI
852/// interoperability on certain compressed-block patterns. Keep hinted
853/// windows at 16 KiB or larger until that compatibility gap is closed.
854const MIN_HINTED_WINDOW_LOG: u8 = 14;
855
/// Source-size tier index, matching upstream `ZSTD_getCParams_internal`'s
/// `tableID = (rSize<=256K)+(rSize<=128K)+(rSize<=16K)`:
///
/// * `0` — larger than 256 KiB, or unknown (`None`)
/// * `1` — above 128 KiB, up to and including 256 KiB
/// * `2` — above 16 KiB, up to and including 128 KiB
/// * `3` — 16 KiB or smaller
///
/// The tier only selects a cParams row. Adjusting level parameters for a
/// known source size is a separate step ([`adjust_params_for_source_size`]):
/// that derives a cap from `ceil(log2(src_size))`, then clamps it to
/// [`MIN_HINTED_WINDOW_LOG`] (16 KiB). A zero-byte size hint is treated as
/// [`MIN_WINDOW_LOG`] for the raw ceil-log step and then promoted to the
/// hinted floor. This keeps tables bounded for small inputs while preserving
/// the encoder's baseline minimum supported window. For the HC backend,
/// `hash_log` and `chain_log` are capped as well.
fn cparams_tier(source_size: Option<u64>) -> usize {
    const KIB: u64 = 1024;
    // An unknown size takes the largest-source tier.
    let Some(size) = source_size else {
        return 0;
    };
    // One point per threshold the size fits under — the same sum upstream
    // uses for `tableID`.
    [256 * KIB, 128 * KIB, 16 * KIB]
        .iter()
        .filter(|&&limit| size <= limit)
        .count()
}
876
877/// Override a Fast (L1/L2) or Dfast (L3) level row's table-shaping cParams
878/// (hashLog / chainLog / minMatch) by source-size tier, matching the
879/// reference `ZSTD_defaultCParameters[tableID][level]`. L1 keeps its base
880/// hashLog (the source-size window clamp in `adjust_params_for_source_size`
881/// already lands on the reference value) and only tiers minMatch; L2 also
882/// tiers hashLog (the tier-0 value 16 oversized the table on medium inputs,
883/// the page-fault pathology); L3 tiers both dfast hash widths. Strategy
884/// switches (L2 tier 1, L4) are intentionally not applied here.
885fn apply_cparams_tier(level: i32, source_size: Option<u64>, p: &mut LevelParams) {
886    let tier = cparams_tier(source_size);
887    // Single source for the table data: the verbatim upstream
888    // `ZSTD_defaultCParameters[tier][level]` row (`cparams::default_cparams`).
889    // The encoder consumes only the table-shaping widths here; the window /
890    // `table_log` clamp lives in `adjust_params_for_source_size`.
891    match level {
892        // Fast, all tiers — minMatch only (hashLog handled by the window clamp).
893        1 => {
894            if let Some(f) = p.fast.as_mut() {
895                f.mls = super::cparams::default_cparams(tier, 1).min_match;
896            }
897        }
898        // Fast (base strategy; tier 1 is dfast upstream — not switched here).
899        2 => {
900            if let Some(f) = p.fast.as_mut() {
901                let cp = super::cparams::default_cparams(tier, 2);
902                f.hash_log = cp.hash_log;
903                f.mls = cp.min_match;
904            }
905        }
906        // Dfast, all tiers — long hashLog (`hash_log`) + short chainLog (`chain_log`).
907        3 => {
908            if let Some(d) = p.dfast.as_mut() {
909                let cp = super::cparams::default_cparams(tier, 3);
910                d.long_hash_log = cp.hash_log as u8;
911                d.short_hash_log = cp.chain_log as u8;
912            }
913        }
914        _ => {}
915    }
916}
917
918fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams {
919    // Derive a source-size-based cap from ceil(log2(src_size)), then
920    // clamp first to MIN_WINDOW_LOG (baseline encoder minimum) and then to
921    // MIN_HINTED_WINDOW_LOG (16 KiB hinted floor). For tiny or zero hints we
922    // therefore keep a 16 KiB effective minimum window in hinted mode.
923    // Raw ceil(log2(src_size)) drives the internal table sizes. The
924    // advertised `window_log` is separately floored at MIN_HINTED_WINDOW_LOG
925    // (a decoder-interop requirement on the wire format), but the hash /
926    // chain table widths are internal and never appear in the frame, so they
927    // can track the actual source size below that floor.
928    let raw_src_log = source_size_ceil_log(src_size);
929    let src_log = raw_src_log.max(MIN_WINDOW_LOG).max(MIN_HINTED_WINDOW_LOG);
930    if src_log < params.window_log {
931        params.window_log = src_log;
932    }
933    // Internal match-finder tables are sized from `table_log` — the RAW
934    // source log (floored only at the baseline `MIN_WINDOW_LOG`), NOT the
935    // wire `window_log` floor. The table widths never appear in the frame, so
936    // for small inputs they can track the actual source size and avoid
937    // zeroing a window-sized table per frame; large inputs keep the level's
938    // widths. The cap is applied with the same per-backend headroom the
939    // level table uses, so the load factor (and match quality) is unchanged.
940    // The Dfast backend derives its table widths from the source in `reset`
941    // (`set_hash_bits` recomputes there), so it is not adjusted here. The Row
942    // backend's width IS capped here, mirroring the upstream zstd (see the Row branch).
943    let table_log = raw_src_log.max(MIN_WINDOW_LOG);
944    let backend = params.backend();
945    if backend == super::strategy::BackendTag::HashChain {
946        let hc = params
947            .hc
948            .as_mut()
949            .expect("HashChain level row carries an HcConfig");
950        if (table_log + 2) < hc.hash_log as u8 {
951            hc.hash_log = (table_log + 2) as usize;
952        }
953        if (table_log + 1) < hc.chain_log as u8 {
954            hc.chain_log = (table_log + 1) as usize;
955        }
956    } else if backend == super::strategy::BackendTag::Row {
957        let row = params
958            .row
959            .as_mut()
960            .expect("Row level row carries a RowConfig");
961        // Upstream zstd `ZSTD_adjustCParams_internal` (zstd_compress.c): once
962        // the window is source-capped, `hashLog <= windowLog + 1`. The row
963        // table is `2^hash_bits` slots, exactly upstream's row hashTable
964        // `2^hashLog` slots, so the same cap applies. Without it the row table
965        // stays at the level's unbounded width (e.g. L12 hash_bits 23 = 4x
966        // upstream's source-capped 21), the dominant peak-memory excess on the
967        // row band.
968        let row_cap = (table_log + 1) as usize;
969        if row_cap < row.hash_bits {
970            row.hash_bits = row_cap;
971        }
972    } else if backend == super::strategy::BackendTag::Simple {
973        let fast = params
974            .fast
975            .as_mut()
976            .expect("Fast level row carries a FastConfig");
977        let fast_cap = (table_log + 1) as u32;
978        if fast_cap < fast.hash_log {
979            fast.hash_log = fast_cap;
980        }
981    }
982    params
983}
984
985fn level22_btultra2_params_for_source_size(source_size: Option<u64>) -> LevelParams {
986    let mut hc = match source_size {
987        Some(size) if size <= 16 * 1024 => BTULTRA2_HC_CONFIG_L22_16K,
988        Some(size) if size <= 128 * 1024 => BTULTRA2_HC_CONFIG_L22_128K,
989        Some(size) if size <= 256 * 1024 => BTULTRA2_HC_CONFIG_L22_256K,
990        _ => BTULTRA2_HC_CONFIG_L22,
991    };
992    let mut window_log = match source_size {
993        Some(size) if size <= 16 * 1024 => 14,
994        Some(size) if size <= 128 * 1024 => 17,
995        Some(size) if size <= 256 * 1024 => 18,
996        _ => 27,
997    };
998    if let Some(size) = source_size
999        && size > 256 * 1024
1000    {
1001        let src_log = source_size_ceil_log(size);
1002        window_log = window_log.min(src_log.max(MIN_WINDOW_LOG));
1003        let adjusted_table_log = window_log as usize + 1;
1004        hc.hash_log = hc.hash_log.min(adjusted_table_log);
1005        hc.chain_log = hc.chain_log.min(adjusted_table_log);
1006    }
1007    LevelParams {
1008        strategy_tag: super::strategy::StrategyTag::BtUltra2,
1009        search: super::strategy::SearchMethod::BinaryTree,
1010        window_log,
1011        lazy_depth: 2,
1012        fast: None,
1013        dfast: None,
1014        hc: Some(hc),
1015        row: None,
1016    }
1017}
1018
1019/// Estimated steady-state heap footprint of a one-shot compression context
1020/// at `level` (window history + match-finder tables + block staging), in
1021/// bytes. Computed from the same per-level tuning table the encoder
1022/// resolves at frame start, so the estimate tracks the real allocations;
1023/// it is an upper-bound style budget figure, not an exact accounting.
1024pub fn estimated_compression_workspace_bytes(level: CompressionLevel) -> usize {
1025    use super::strategy::StrategyTag;
1026    let params = resolve_level_params(level, None);
1027    let window = 1usize << params.window_log;
1028    // Mirror `configure()`: the HC3 short-match side table exists only on
1029    // the btultra/btultra2 tags (minMatch 3), capped by the window log; the
1030    // BT pointer-pair layout fits inside the `4 << chain_log` chain term
1031    // (pairs over `chain_log - 1` nodes).
1032    let wants_hash3 = matches!(
1033        params.strategy_tag,
1034        StrategyTag::BtUltra | StrategyTag::BtUltra2
1035    );
1036    let uses_bt = matches!(
1037        params.strategy_tag,
1038        StrategyTag::Btlazy2 | StrategyTag::BtOpt | StrategyTag::BtUltra | StrategyTag::BtUltra2
1039    );
1040    let tables = params.fast.map(|f| 4usize << f.hash_log).unwrap_or(0)
1041        + params
1042            .dfast
1043            .map(|d| (4usize << d.long_hash_log) + (4usize << d.short_hash_log))
1044            .unwrap_or(0)
1045        + params
1046            .hc
1047            .map(|h| {
1048                let hash3 = if wants_hash3 {
1049                    4usize
1050                        << super::match_table::storage::HC3_HASH_LOG.min(params.window_log as usize)
1051                } else {
1052                    0
1053                };
1054                (4usize << h.hash_log) + (4usize << h.chain_log) + hash3
1055            })
1056            .unwrap_or(0)
1057        + params
1058            .row
1059            .map(|r| (4usize << r.hash_bits) + (2usize << r.hash_bits))
1060            .unwrap_or(0);
1061    // BT modes box a `BtMatcher`; its retained scratch layout is budgeted
1062    // next to the struct so estimator and allocator evolve together.
1063    let bt = if uses_bt {
1064        super::bt::BtMatcher::estimated_workspace_bytes()
1065    } else {
1066        0
1067    };
1068    // Block staging: literal + sequence buffers plus the compressed-block
1069    // scratch, each bounded by the 128 KiB block size.
1070    let staging = 3 * (128 * 1024);
1071    window + tables + bt + staging
1072}
1073
1074/// Extra steady-state workspace the binary-tree strategies (ordinals 6..=9,
1075/// btlazy2..btultra2) retain beyond the hash/chain tables: the boxed matcher
1076/// plus its scratch arenas, and the HC3 short-match side table for
1077/// btultra/btultra2 (capped by the window log). 0 for non-BT ordinals.
1078pub fn estimated_bt_strategy_extra_bytes(strategy_ordinal: u32, window_log: u32) -> usize {
1079    if !(6..=9).contains(&strategy_ordinal) {
1080        return 0;
1081    }
1082    let hash3 = if matches!(strategy_ordinal, 8 | 9) {
1083        4usize << super::match_table::storage::HC3_HASH_LOG.min(window_log as usize)
1084    } else {
1085        0
1086    };
1087    super::bt::BtMatcher::estimated_workspace_bytes() + hash3
1088}
1089
1090/// Resolve a [`CompressionLevel`] (+ optional source-size hint) to the
1091/// concrete [`LevelParams`] the matcher runs: strategy tag, search method
1092/// (match-finder), window log, and per-backend config.
1093///
1094/// ## CRITICAL: input size changes the match-finder (and can change strategy)
1095///
1096/// The resolved geometry is a function of the SOURCE SIZE, not the level
1097/// alone. This is the easy-to-miss part (so read this before assuming a level
1098/// maps to one fixed match-finder). It mirrors three upstream zstd stages:
1099///
1100/// 1. [`LEVEL_TABLE`] holds the tier-0 (source > 256 KiB) base row per level
///    (upstream `ZSTD_defaultCParameters[0]`). L5-L12 carry
///    `SearchMethod::RowHash` (the Row match-finder), like upstream's
///    greedy/lazy default.
1104/// 2. [`apply_cparams_tier`] overrides the table-shaping widths for the
1105///    smaller source tiers (upstream `ZSTD_getCParams_internal` tier table).
1106///    NOTE: upstream ALSO switches STRATEGY in some tiers (L2 → dfast, L4 →
1107///    greedy on small sources); those backend switches are NOT yet replicated,
1108///    so those levels keep their base strategy on small inputs.
1109/// 3. [`adjust_params_for_source_size`] caps `window_log` to
1110///    ~`ceil_log2(source_size)` (upstream `ZSTD_adjustCParams_internal`).
1111///
1112/// THEN, in the matcher `reset`, the greedy/lazy band falls back from
1113/// `RowHash` to `SearchMethod::HashChain` when the resolved `window_log <= 14`
1114/// — exactly upstream's `ZSTD_resolveRowMatchFinderMode` (the Row match-finder
1115/// is used for greedy/lazy/lazy2 ONLY when `windowLog > 14`). Net effect for
1116/// the SAME level:
1117///
1118/// * small input (e.g. a 10 KiB fixture → `window_log` 14) → **HashChain**
1119///   (`ZSTD_HcFindBestMatch`, scalar chain walk);
1120/// * large input (e.g. 1 MiB → `window_log` 20) → **RowHash** (the SIMD-tag
1121///   row match-finder).
1122///
1123/// A dictionary does NOT change the match-finder: it only downsizes the
1124/// prepared tables (`cdict_table_logs`, mirroring `ZSTD_createCDict`'s
1125/// small-source assumption), while `window_log` stays source-derived. So
1126/// `(L6, 10 KiB, +dict)` is HashChain and `(L6, 1 MiB, +dict)` is RowHash,
1127/// both matching upstream. When comparing against C on a fixture, resolve the
1128/// match-finder from the fixture's size first, or you may optimise/benchmark a
1129/// path C does not even take for that input.
1130fn resolve_level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
1131    if matches!(level, CompressionLevel::Level(22)) {
1132        return level22_btultra2_params_for_source_size(source_size);
1133    }
1134    let params = match level {
1135        CompressionLevel::Uncompressed => LevelParams {
1136            strategy_tag: super::strategy::StrategyTag::Fast,
1137            search: super::strategy::SearchMethod::Fast,
1138            // Uncompressed frames emit raw blocks and never reference
1139            // history; advertising a larger window only inflates
1140            // decoder-side buffer reservation. Stay at 17 (128 KiB).
1141            window_log: 17,
1142            lazy_depth: 0,
1143            // Beyond-upstream zstd: hash_log=14 (vs upstream zstd's 13) for 2× fewer
1144            // collisions on structured corpora. Upstream zstd's "base for negative"
1145            // row has targetLength=1 → step_size = 1 + 0 + 1 = 2.
1146            fast: Some(FastConfig {
1147                hash_log: 14,
1148                mls: 6,
1149                step_size: 2,
1150            }),
1151            dfast: None,
1152            hc: None,
1153            row: None,
1154        },
1155        CompressionLevel::Fastest => {
1156            // Only the Fast-specific cParams
1157            // (fast_hash_log / fast_mls / fast_step_size) align
1158            // with Uncompressed / negative-base row. window_log
1159            // stays at LEVEL_TABLE[0]'s value (19) — Fastest still
1160            // does real compression on a full window, unlike
1161            // Uncompressed which clamps to 17.
1162            let mut p = LEVEL_TABLE[0];
1163            p.fast = Some(FastConfig {
1164                hash_log: 14,
1165                mls: 6,
1166                step_size: 2,
1167            });
1168            p
1169        }
1170        CompressionLevel::Default => {
1171            // Default == Level(DEFAULT_LEVEL); tier it the same way an explicit
1172            // positive level is, so hinted default compression shrinks its
1173            // table widths on small / medium frames instead of keeping the
1174            // tier-0 row (the oversized-table page-fault pathology).
1175            let mut p = LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1];
1176            apply_cparams_tier(CompressionLevel::DEFAULT_LEVEL, source_size, &mut p);
1177            p
1178        }
1179        CompressionLevel::Better => LEVEL_TABLE[6],
1180        // Level 13: the first dominant point of the deep-lazy band. The
1181        // mls-wide row key lifted the shallow band's ratio enough that
1182        // level 11 no longer strictly beats level 7 on the ladder corpus;
1183        // the `Best` alias belongs on a config that dominates everything
1184        // below it rather than on a hair-thin margin.
1185        CompressionLevel::Best => LEVEL_TABLE[12],
1186        CompressionLevel::Level(n) => {
1187            if n > 0 {
1188                let idx = (n as usize).min(CompressionLevel::MAX_LEVEL as usize) - 1;
1189                let mut p = LEVEL_TABLE[idx];
1190                // Upstream zstd selects the cParams row from a 4-way
1191                // source-size-tiered table (`ZSTD_getCParams_internal` →
1192                // `ZSTD_defaultCParameters[tableID][level]`), and the Fast /
1193                // Dfast hashLog, chainLog and minMatch shrink for smaller
1194                // inputs. The `LEVEL_TABLE` base is the tier-0 (> 256 KiB) row;
1195                // override the table-shaping params per tier here so small and
1196                // medium frames use the reference's table widths (the oversized
1197                // tier-0 widths were a per-frame alloc / page-fault pathology on
1198                // medium inputs) and minMatch (short matches the wide hash
1199                // skips). NOTE: the reference also switches STRATEGY in some
1200                // tiers (L2 → dfast at 128..256 KiB, L4 → greedy at <= 16 KiB
1201                // and 128..256 KiB); those backend switches are not yet tiered,
1202                // so those tiers keep the base strategy.
1203                apply_cparams_tier(n, source_size, &mut p);
1204                p
1205            } else if n == 0 {
1206                // Level 0 = default, matching C zstd semantics. Tier it like the
1207                // `Default` alias so `Level(0)` and `Default` stay identical.
1208                let mut p = LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1];
1209                apply_cparams_tier(CompressionLevel::DEFAULT_LEVEL, source_size, &mut p);
1210                p
1211            } else {
1212                // Negative levels — upstream zstd sets
1213                // targetLength = -level (clampedCompressionLevel),
1214                // yielding step_size = (-level) + 1 since
1215                // !(targetLength) = 0 when targetLength > 0.
1216                // So L-1..L-7 get step_size 2..8. Acceleration
1217                // gradient comes from larger step skipping more
1218                // positions per iter (faster, worse ratio).
1219                // Clamp to upstream zstd's MIN_LEVEL before negating so
1220                // i32::MIN can't overflow on `-n`.
1221                let clamped = n.max(CompressionLevel::MIN_LEVEL);
1222                let target_length = (-clamped) as usize;
1223                let step_size = target_length + 1;
1224                // Upstream zstd row-0 ("base for negative", clevels.h srcSize>256KB):
1225                // hashLog=13, minMatch=7. The 32 KiB hash table (2^13 * 4B)
1226                // is L1d-resident on contemporary cores, so every probe is an
1227                // L1 hit; hashLog=14 (64 KiB) overflows a 32 KiB L1d and turns
1228                // each probe into an L2 access. minMatch=7 (vs 6) skips
1229                // short-distance 6-byte matches: fewer sequences, less
1230                // extension/emit work, and parity with the upstream zstd's negative
1231                // ladder on both ratio and throughput.
1232                LevelParams {
1233                    strategy_tag: super::strategy::StrategyTag::Fast,
1234                    search: super::strategy::SearchMethod::Fast,
1235                    window_log: 19,
1236                    lazy_depth: 0,
1237                    fast: Some(FastConfig {
1238                        hash_log: 13,
1239                        mls: 7,
1240                        step_size,
1241                    }),
1242                    dfast: None,
1243                    hc: None,
1244                    row: None,
1245                }
1246            }
1247        }
1248    };
1249    if let Some(size) = source_size {
1250        adjust_params_for_source_size(params, size)
1251    } else {
1252        params
1253    }
1254}
1255
1256/// The cheap fingerprint pre-splitter level for a compression level (the
1257/// C-like `blockSplitterLevel`), resolved through the same per-level
1258/// `LevelParams` table as every other tuning knob. `None` keeps the whole
1259/// 128 KiB block. The frame loop reads this instead of hardcoding the
1260/// level→split mapping at the call site.
1261pub(crate) fn level_pre_split(level: CompressionLevel) -> Option<usize> {
1262    // Resolve through `resolve_level_params` directly — NOT via the legacy
1263    // `numeric_level()` alias — so named presets read the SAME table row as
1264    // every other tuning knob (`Best` maps to its own row there, which is
1265    // not the row its numeric alias points at). `Uncompressed` (raw
1266    // blocks) never splits.
1267    if matches!(level, CompressionLevel::Uncompressed) {
1268        return None;
1269    }
1270    resolve_level_params(level, None)
1271        .pre_split()
1272        .map(usize::from)
1273}
1274
/// Backend storage for [`MatchGeneratorDriver`]. Exactly one match-finder
/// state lives in the driver at a time — the active variant. Backend
/// transitions in [`Matcher::reset`] drain the current variant's allocations
/// into the shared `vec_pool` and then replace `storage` with a freshly
/// constructed variant for the new backend.
///
/// Replaces the prior pattern of four parallel fields (`match_generator`,
/// `dfast_match_generator: Option<…>`, `row_match_generator: Option<…>`,
/// `hc_match_generator: Option<…>`) + an `active_backend: BackendTag`
/// discriminator: the parallel layout kept drained inner structures
/// allocated across backend switches, and every per-frame/per-slice
/// driver operation had to dispatch on `active_backend` to pick the
/// right field. A single enum collapses the storage and makes the
/// dispatcher pattern-match on the storage variant directly — same
/// number of arms, but `storage.backend()` is now the canonical source
/// of truth and dead variants are dropped when the active backend
/// changes.
///
/// Only `Clone` is derived; the driver's `primed` snapshot stores a
/// `MatcherStorage` value alongside the live one.
#[derive(Clone)]
enum MatcherStorage {
    /// Upstream zstd `ZSTD_fast` family. Constructed by
    /// [`MatchGeneratorDriver::new`] as the initial variant and
    /// re-selected by [`Matcher::reset`] for any [`CompressionLevel`]
    /// that `resolve_level_params` maps to [`StrategyTag::Fast`]
    /// (`Uncompressed`, `Fastest`, `Level(1)`, and every negative
    /// `Level(n)`; `Level(0)` is the default level and is NOT in this
    /// family).
    Simple(FastKernelMatcher),
    /// Upstream zstd `ZSTD_dfast` family — two-table hash chain. Selected for
    /// any level that resolves to [`StrategyTag::Dfast`] in
    /// `resolve_level_params` (`Default`, `Level(0)`, `Level(2)`,
    /// `Level(3)`).
    Dfast(DfastMatchGenerator),
    /// Upstream zstd `ZSTD_greedy` family with row hashing. Selected for any
    /// level that resolves to [`StrategyTag::Greedy`] (currently
    /// `Level(4)` only).
    Row(RowMatchGenerator),
    /// Upstream zstd `ZSTD_lazy2` and the BT-based optimal modes
    /// (`btopt` / `btultra` / `btultra2`). Selected for any level that
    /// resolves to [`StrategyTag::Lazy`], [`StrategyTag::BtOpt`],
    /// [`StrategyTag::BtUltra`], or [`StrategyTag::BtUltra2`]
    /// (`Better`, `Best`, `Level(5..=22)`, and any `Level(n)` with
    /// `n > MAX_LEVEL` — `resolve_level_params` caps positive numeric
    /// levels with `(n as usize).min(MAX_LEVEL as usize)` before indexing
    /// `LEVEL_TABLE`, so with `MAX_LEVEL = 22` every
    /// `Level(23..=i32::MAX)` lands on the `MAX_LEVEL` row, i.e.
    /// `BtUltra2`). The [`HcMatchGenerator`]'s internal [`HcBackend`]
    /// discriminator decides whether BT scratch is allocated.
    HashChain(HcMatchGenerator),
}
1323
1324impl MatcherStorage {
1325    /// Heap bytes the active backend variant holds (tables, history, scratch).
1326    fn heap_size(&self) -> usize {
1327        match self {
1328            Self::Simple(m) => m.heap_size(),
1329            Self::Dfast(m) => m.heap_size(),
1330            Self::Row(m) => m.heap_size(),
1331            Self::HashChain(m) => m.heap_size(),
1332        }
1333    }
1334
1335    /// [`super::strategy::BackendTag`] family of the active variant.
1336    fn backend(&self) -> super::strategy::BackendTag {
1337        use super::strategy::BackendTag;
1338        match self {
1339            Self::Simple(_) => BackendTag::Simple,
1340            Self::Dfast(_) => BackendTag::Dfast,
1341            Self::Row(_) => BackendTag::Row,
1342            Self::HashChain(_) => BackendTag::HashChain,
1343        }
1344    }
1345}
1346
/// This is the default implementation of the `Matcher` trait. It allocates and reuses the buffers when possible.
///
/// The driver owns exactly one match-finder backend (`storage`) plus the
/// per-frame configuration resolved at `reset()` and the bookkeeping needed
/// for dictionary priming and the borrowed (no-copy) block path.
pub struct MatchGeneratorDriver {
    // Pool of reusable byte buffers. `recycle_simple_space` pushes the Simple
    // backend's spent per-block input buffer here (length 0, capacity kept),
    // and `Matcher::reset` drains a replaced backend's allocations into it.
    vec_pool: Vec<Vec<u8>>,
    /// Active match-finder state. Exactly one backend lives here at a
    /// time; [`Matcher::reset`] drains the previous variant into
    /// `vec_pool` before swapping in a freshly constructed variant for
    /// the new backend. `storage.backend()` is the canonical source of
    /// truth for the parse family; `strategy_tag` carries the
    /// compile-time strategy chosen at the last `reset()`.
    storage: MatcherStorage,
    // Strategy tag resolved (at run time) in `reset()` from the
    // requested `CompressionLevel`'s `LevelParams`. "Compile-time" in the
    // notes above refers to what it selects: the driver's hot-block
    // dispatcher in `blocks/compressed.rs` matches on this tag to enter
    // the corresponding `Strategy` monomorphisation
    // (`compress_block::<S>`).
    strategy_tag: super::strategy::StrategyTag,
    // Decoupled search-method axis resolved at `reset()` from
    // `LevelParams.search`. The per-block dispatcher routes on this
    // (not on `strategy_tag`) so a level's parse and search backend can
    // be chosen independently. The `BinaryTree` arm still consults
    // `strategy_tag` to pick the opt `Strategy` ZST.
    search: super::strategy::SearchMethod,
    // Decoupled parse-mode axis resolved at `reset()` from
    // `LevelParams::parse()`. Independent of `search`: greedy / lazy /
    // lazy2 can run on any non-opt search backend. The backends still
    // read their own `lazy_depth` (kept in sync at `reset()`); this is
    // the authoritative parse selector for the dispatcher.
    parse: super::strategy::ParseMode,
    /// Test-only per-level recipe override applied in `reset()` before
    /// backend selection. Lets the parse×search matrix be exercised
    /// without editing `LEVEL_TABLE`; never compiled into production.
    #[cfg(test)]
    config_override: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    /// Fine-grained per-knob overrides from the public
    /// [`super::parameters::CompressionParameters`] surface (#27).
    /// `None` (or an all-`None` [`super::parameters::ParamOverrides`])
    /// keeps the resolved level geometry byte-identical to plain
    /// level-based compression. Applied in [`Matcher::reset`] after the
    /// level params are resolved, before backend selection. Persists
    /// across resets (it is frame configuration, not a one-shot) until
    /// the caller changes it.
    param_overrides: Option<super::parameters::ParamOverrides>,
    // Current matcher input chunk size; `new` initialises it from its
    // `slice_size` argument. NOTE(review): the recycle path's comments say a
    // small source-size hint can shrink this mid-frame — confirm where.
    slice_size: usize,
    // The `slice_size` originally passed to `new`. NOTE(review): presumably
    // the baseline `slice_size` is restored to — confirm against `reset()`.
    base_slice_size: usize,
    // Frame header window size must stay at the configured live-window budget.
    // Dictionary retention expands internal matcher capacity only.
    reported_window_size: usize,
    // Tracks currently retained bytes that originated from primed dictionary
    // history and have not been evicted yet.
    dictionary_retained_budget: usize,
    // Source size hint for next frame (set via set_source_size_hint, cleared on reset).
    source_size_hint: Option<u64>,
    // Dictionary content size for the next frame (set via set_dictionary_size_hint,
    // consumed on reset). When present on a binary-tree / hash-chain backend, the
    // match-finder hash/chain tables are sized from the DICTIONARY (upstream zstd CDict
    // economics: a loaded dictionary supplies the long matches, so the live tables
    // can shrink to the dict's size tier) while the eviction window stays
    // source-sized. Mirrors upstream zstd `ZSTD_getCParamRowSize`, which picks the cParams
    // table column from `dictSize` for a dictionary-bearing compress.
    dictionary_size_hint: Option<usize>,
    // Normalized `ceil_log2` bucket of the frame's source-size hint, captured at
    // `reset` (where `source_size_hint` is consumed) via [`source_size_ceil_log`].
    // `None` means the frame was unhinted. Drives `prime_with_dictionary`'s upstream zstd
    // `ZSTD_shouldAttachDict` mode for the Simple/Fast backend: `None` (unknown)
    // or `<= FAST_ATTACH_DICT_CUTOFF_LOG` → attach (separate dict table, 2-cursor
    // `compress_block_fast_dict`); larger → copy (dictionary primed into the live
    // table, 4-cursor `compress_block_fast`). The primed-snapshot key is the
    // resolved shape ([`reset_shape`](Self::reset_shape)), not this bucket.
    reset_size_log: Option<u8>,
    // Whether the loaded dictionary fits the Fast attach path's tagged position
    // field (`<= MAX_FAST_ATTACH_DICT_REGION`). Captured at `reset` from the
    // dict-size hint (which equals the actual dict length on load) so the Fast
    // attach decision, the attach-epoch reset bit, and the primed-snapshot
    // `fast_attach` bit all gate on it consistently. `true` when there is no
    // dictionary (the attach path is then unused). A dict too large to tag falls
    // back to copy mode instead of overflowing the packed position.
    reset_dict_attach_ok: bool,
    // Hint-resolved matcher shape from the last `reset`: the [`LevelParams`], the
    // active backend's applied Dfast/Row hash-table width (`0` for HC/Fast), the
    // Fast attach-vs-copy mode, and the active LDM override (#27). Combined with
    // the frame's level into the [`PrimedKey`] that keys the primed snapshot, so
    // it is only restored into a reset that resolved the identical matcher AND
    // LDM configuration. `None` before the first `reset`.
    reset_shape: Option<(
        LevelParams,
        usize,
        bool,
        Option<super::parameters::LdmOverride>,
    )>,
    // One-shot borrowed block range `[start, end)` staged by the borrowed
    // frame path (`set_borrowed_block`) for the NEXT
    // `start_matching` / `skip_matching_with_hint`. `Some` routes that
    // call to the borrowed scan instead of the owned committed-block
    // path; consumed (reset to `None`) by the routed call, and also
    // cleared by `clear_borrowed_window`. Always `None` on the owned
    // streaming path. NOTE(review): earlier wording named only the Simple
    // backend, but `set_borrowed_block` stages on every backend — confirm
    // the routing in `start_matching`.
    borrowed_pending: Option<(usize, usize)>,
    /// CDict-equivalent: snapshot of the post-prime matcher state taken
    /// once after the first dictionary prime — the backend `storage`
    /// (hash tables + dictionary history + offset history + window) plus
    /// the driver-level `dictionary_retained_budget`, the only two pieces
    /// `prime_with_dictionary` writes. Subsequent frames restore this
    /// (a table memcpy) instead of re-hashing every dictionary position,
    /// mirroring upstream zstd `ZSTD_compressBegin_usingCDict` copying the
    /// precomputed `cdict->matchState`. Invalidated when the dictionary
    /// changes; keyed by the [`PrimedKey`] resolved matcher shape so a snapshot
    /// is only restored into a reset that produces the same matcher — see
    /// `restore_primed_dictionary`.
    primed: Option<(MatcherStorage, usize, PrimedKey)>,
}
1456
/// Identity of the matcher configuration a primed snapshot was captured under:
/// the FULLY RESOLVED matcher shape, not the raw source-size hint.
///
/// `reset()` resolves the hint into a [`LevelParams`] (window_log cap, the
/// HC/Fast table and search geometry, the parse depth/target-length that get
/// baked into the restored `storage`) plus, for the Dfast/Row backends, a
/// table-width derived from the hint's ceil-log bucket. The mapping from hint
/// to resolved shape is many-to-one: the source-size adjustment is monotone in
/// `ceil_log2(hint)`, and Level 22 additionally collapses several buckets onto
/// one upstream zstd tier (its `<= 16/128/256 KiB` thresholds). Keying on the raw hint
/// (or even its ceil-log bucket) therefore over-keys — two hints that resolve
/// to the identical matcher would each force a full re-prime. Keying on the
/// resolved (`params`, `table_bits`) pair restores across them.
///
/// `table_bits` is the hint-dependent hash-table width the ACTIVE backend
/// applied (`set_hash_bits` value for Dfast/Row; `0` for HC/Fast, whose widths
/// already live in `params`). The snapshot is only ever captured on the COPY
/// path (a hinted, above-cutoff frame), so `table_bits` is always the resolved
/// Dfast/Row value there, never the unhinted default.
///
/// `level` is kept alongside the resolved `params` because some stored matcher
/// state is derived from the level DIRECTLY, not through `params`: e.g. Dfast's
/// `use_fast_loop` is true for L3 but false for L4, yet L3 and L4 resolve to
/// byte-identical `params`. Without `level` a snapshot captured at L3 could be
/// restored into an L4 reset, installing the wrong `use_fast_loop`.
/// NOTE(review): the `MatcherStorage` docs list `Level(4)` under the Row
/// backend, so this L3/L4 Dfast example may be stale — confirm; the reason
/// for keying on `level` stands either way.
///
/// `fast_attach` records the Fast backend's attach-vs-copy mode
/// ([`FAST_ATTACH_DICT_CUTOFF_LOG`]) because that cutoff (8 KiB) falls INSIDE a
/// single resolved shape: an 8192- and an 8193-byte Level 1 hint both clamp to
/// window_log 14 with identical `params`/`table_bits`, yet 8192 attaches (a
/// separate dict table) while 8193 copies into the live table — two different
/// `storage` shapes. The frame compressor only captures/restores snapshots on
/// the copy path today, but keying on the mode keeps the snapshot identity
/// self-sufficient rather than relying on that external gate.
///
/// Restoring a snapshot whose key differs would reinstate the old `storage`
/// (and its `max_window_size` / table dimensions / parse params / dict-table
/// shape) under a reset that resolved a different shape — the encoder could
/// then search past the frame header's window and emit an undecodable match.
/// All fields must match before a restore is allowed; the derived
/// `PartialEq`/`Eq` compare every field.
#[derive(Clone, Copy, PartialEq, Eq)]
struct PrimedKey {
    // Requested compression level; guards level-derived state that `params`
    // alone cannot distinguish.
    level: super::CompressionLevel,
    // Fully resolved per-level parameters the snapshot was built under.
    params: LevelParams,
    // Hash-table width applied by the active backend (`0` for HC/Fast).
    table_bits: usize,
    // Fast backend attach-vs-copy mode at capture time.
    fast_attach: bool,
    /// Fine-grained LDM override (#27) active at capture time. The
    /// snapshot's cloned `storage` carries `BtMatcher::ldm_producer`,
    /// which is configured from this override; restoring a snapshot
    /// captured under a different LDM configuration (enable flip or
    /// changed knobs) would reinstate a stale producer. `params` already
    /// pins `window_log` / `strategy_tag` (the rest of the producer's
    /// identity), so folding the override completes the LDM identity.
    /// `None` = LDM off, matching `ParamOverrides::ldm`.
    ldm: Option<super::parameters::LdmOverride>,
}
1513
1514impl MatchGeneratorDriver {
1515    /// `slice_size` sets the base block allocation size used for matcher input chunks.
1516    /// `max_slices_in_window` determines the initial window capacity at construction
1517    /// time. Effective window sizing is recalculated on every [`reset`](Self::reset)
1518    /// from the resolved compression level and optional source-size hint.
1519    pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
1520        // Validate inputs before deriving window_log_init. Three
1521        // failure modes need explicit guards:
1522        //
1523        // 1. Zero args → `max_window_size = 0` → silent 1-byte
1524        //    degenerate window (useless).
1525        // 2. Multiplication overflow on `slice_size *
1526        //    max_slices_in_window` → wraps silently in release.
1527        // 3. `next_power_of_two` overflow when the product is
1528        //    above `1 << (usize::BITS - 1)` → modern Rust PANICS
1529        //    on overflow (older Rust returned 0).
1530        //
1531        // Catch all three at construction with a clear domain-
1532        // specific message via `assert!` + `checked_mul` +
1533        // `checked_next_power_of_two`, rather than letting either
1534        // mode produce a silent degenerate matcher OR a generic
1535        // panic deep in `FastKernelMatcher::with_params`.
1536        assert!(
1537            slice_size > 0,
1538            "MatchGeneratorDriver::new requires slice_size > 0 (got 0)",
1539        );
1540        assert!(
1541            max_slices_in_window > 0,
1542            "MatchGeneratorDriver::new requires max_slices_in_window > 0 (got 0)",
1543        );
1544        let max_window_size = max_slices_in_window
1545            .checked_mul(slice_size)
1546            .expect("MatchGeneratorDriver::new: slice_size * max_slices_in_window overflows usize");
1547        // Derive an effective window_log for the initial-state matcher.
1548        // `MatchGeneratorDriver::new` runs BEFORE any reset, so it has
1549        // no LevelParams to consult — we initialise to whatever
1550        // window_log fits the caller's requested max_window_size
1551        // (round up to the next power of two via `next_power_of_two`'s
1552        // log). Reset() overwrites all three params from the resolved
1553        // LevelParams.
1554        //
1555        // `checked_next_power_of_two` returns `None` if the next power
1556        // of two would overflow `usize`. Modern Rust's
1557        // `next_power_of_two` PANICS on overflow rather than returning
1558        // 0 (the panic message is generic and unhelpful), so use the
1559        // checked variant to surface the failure with a clear,
1560        // domain-specific error.
1561        let next_pow2 = max_window_size.checked_next_power_of_two().expect(
1562            "MatchGeneratorDriver::new: max_window_size too large for \
1563             next_power_of_two without overflow",
1564        );
1565        let window_log_init = next_pow2.trailing_zeros() as u8;
1566        Self {
1567            vec_pool: Vec::new(),
1568            // Deferred table: `new` runs before any source size or resolved
1569            // LevelParams exist, so allocating at the level-default hash_log
1570            // here would be thrown away by the first frame's reset (which
1571            // clamps the window to the input and reallocs at the resolved
1572            // size). The deferral lets that first reset allocate exactly once.
1573            storage: MatcherStorage::Simple(FastKernelMatcher::with_params_deferred(
1574                window_log_init,
1575                FAST_LEVEL_1_HASH_LOG,
1576                FAST_LEVEL_1_MLS,
1577                2, // upstream zstd default step_size (targetLength=0 → step=2)
1578            )),
1579            strategy_tag: super::strategy::StrategyTag::Fast,
1580            search: super::strategy::SearchMethod::Fast,
1581            parse: super::strategy::ParseMode::Greedy,
1582            #[cfg(test)]
1583            config_override: None,
1584            param_overrides: None,
1585            slice_size,
1586            base_slice_size: slice_size,
1587            // Report the ROUNDED-UP window size that the matcher
1588            // actually carries (via `window_log_init = log2(next_pow2)`
1589            // → matcher's `max_window_size = 1 << window_log_init =
1590            // next_pow2`). For non-power-of-two `slice_size *
1591            // max_slices_in_window` inputs, the unrounded value
1592            // would under-report the active backend's window until
1593            // the first `reset()` overwrites both sides from the
1594            // resolved LevelParams.
1595            reported_window_size: next_pow2,
1596            reset_size_log: None,
1597            reset_dict_attach_ok: true,
1598            reset_shape: None,
1599            dictionary_retained_budget: 0,
1600            source_size_hint: None,
1601            dictionary_size_hint: None,
1602            borrowed_pending: None,
1603            primed: None,
1604        }
1605    }
1606
    /// Resolves `level`, together with the optional `source_size` hint, into
    /// its [`LevelParams`]. Thin forwarding wrapper over
    /// `resolve_level_params`; it adds no logic of its own.
    fn level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
        resolve_level_params(level, source_size)
    }
1610
    /// Install the public-parameter per-knob overrides (#27) applied at
    /// the next [`Matcher::reset`]. `None` (or an all-`None` set) restores
    /// plain level-based geometry. Persists across resets until changed.
    ///
    /// `overrides` replaces any previously installed set wholesale (plain
    /// assignment, no merging); this call only stores the value.
    pub(crate) fn set_param_overrides(
        &mut self,
        overrides: Option<super::parameters::ParamOverrides>,
    ) {
        self.param_overrides = overrides;
    }
1620
    /// Active backend family derived from the storage variant. Single
    /// source of truth — no separate runtime tag to drift against.
    ///
    /// Delegates to `MatcherStorage::backend` on the current `storage`.
    pub(crate) fn active_backend(&self) -> super::strategy::BackendTag {
        self.storage.backend()
    }
1626
1627    /// Whether the borrowed (no-copy, in-place over-window) scan is
1628    /// implemented for the current backend + search configuration. The
1629    /// HashChain backend serves both the lazy CHAIN parser
1630    /// (`SearchMethod::HashChain`) and the BT/optimal parsers
1631    /// (`SearchMethod::BinaryTree`); only the lazy chain has a borrowed scan
1632    /// so far, so BT/optimal stay on the owned path.
1633    pub(crate) fn borrowed_supported(&self) -> bool {
1634        use super::strategy::{BackendTag, SearchMethod, StrategyTag};
1635        match self.active_backend() {
1636            BackendTag::Simple | BackendTag::Dfast | BackendTag::Row => true,
1637            // The HashChain backend covers two searches: the lazy CHAIN parser
1638            // (borrowed-capable) and the BINARY-TREE search (btlazy2 L13-15 +
1639            // optimal BtOpt/BtUltra/BtUltra2 L16-22). btlazy2's BT-tree borrowed
1640            // scan is byte-identical to owned (reads via live_history()), so it
1641            // takes the in-place path. The OPTIMAL parsers stay owned: their
1642            // cost-based DP is sensitive to candidate quality, and the borrowed
1643            // continuous-index scan yields slightly different (ratio-worse)
1644            // candidates than the owned evict+rehash scan — borrowed optimal
1645            // both diverged from owned and fell outside the ffi ratio bound.
1646            // Search-aware (not just strategy_tag) so optimal BT can never be
1647            // staged on the borrowed path even via an internal caller.
1648            BackendTag::HashChain => match self.search {
1649                SearchMethod::HashChain => true,
1650                SearchMethod::BinaryTree => matches!(self.strategy_tag, StrategyTag::Btlazy2),
1651                _ => false,
1652            },
1653        }
1654    }
1655
1656    /// Whether a DICTIONARY frame can take the borrowed (no input copy) path.
1657    /// Only the Simple (Fast) backend with the dictionary ATTACHED (not the
1658    /// copy/merge regime) has a borrowed dict scan — `start_matching_borrowed_dict`
1659    /// reads live matches from the borrowed input in place and dict matches
1660    /// from the committed dict prefix via the 2-segment counter. Every other
1661    /// backend, and copy-mode (large-input) dict frames, stay on the owned
1662    /// path. Checked AFTER priming, so `is_attached()` reflects the resolved
1663    /// attach-vs-copy decision.
1664    pub(crate) fn borrowed_dict_supported(&self) -> bool {
1665        matches!(
1666            &self.storage,
1667            MatcherStorage::Simple(m) if m.dict_is_attached()
1668        )
1669    }
1670
1671    fn simple_mut(&mut self) -> &mut FastKernelMatcher {
1672        match &mut self.storage {
1673            MatcherStorage::Simple(m) => m,
1674            _ => panic!("simple backend must be initialized by reset() before use"),
1675        }
1676    }
1677
1678    /// Reclaim the per-block input buffer that the Simple backend
1679    /// just spent inside `start_matching` / `skip_matching_with_hint`.
1680    ///
1681    /// `FastKernelMatcher::take_recycled_space` returns the cleared
1682    /// (capacity-retained) `Vec<u8>` from the last
1683    /// `extend_history_with_pending`. We push it onto `vec_pool`
1684    /// as-is (with `len = 0`); `get_next_space()` is responsible for
1685    /// resizing the buffer back to `slice_size` on its next pop. The
1686    /// pushed length is irrelevant — only the capacity matters, and
1687    /// `extend_history_with_pending` preserves it. Without this
1688    /// recycle path, the Simple backend would allocate a new
1689    /// `Vec<u8>` per block — a measurable hot-path cost when blocks
1690    /// are small (~128 KiB) and processed at hundreds of MiB/s.
1691    fn recycle_simple_space(&mut self) {
1692        if let Some(space) = self.simple_mut().take_recycled_space() {
1693            // `space` is already cleared (len = 0) by
1694            // `extend_history_with_pending`; capacity is retained.
1695            // Leaving `len = 0` here avoids the cost of zero-filling
1696            // the entire allocation — `get_next_space()` resizes the
1697            // popped buffer up to `slice_size` on demand, so the
1698            // length the pool holds is irrelevant. This matters most
1699            // after a small-source-size hint has shrunk `slice_size`
1700            // mid-frame: the recycled buffer can be much larger than
1701            // the current `slice_size`, and zero-filling 128 KiB+ on
1702            // every block would erase the perf win the recycle path
1703            // is meant to deliver.
1704            self.vec_pool.push(space);
1705        }
1706    }
1707
1708    /// Register a caller-owned input buffer as the Simple backend's
1709    /// borrowed one-shot match window. Only valid on the Simple (Fast)
1710    /// backend; the one-shot frame path gates on that before calling.
1711    ///
1712    /// # Safety
1713    /// Same contract as [`FastKernelMatcher::set_borrowed_window`]: the
1714    /// buffer must stay live and unmodified until the window is cleared,
1715    /// and must be cleared before the buffer is dropped or the matcher is
1716    /// reused for another frame.
1717    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
1718        // SAFETY: forwarded contract — caller upholds liveness/clear.
1719        match self.active_backend() {
1720            super::strategy::BackendTag::Simple => unsafe {
1721                self.simple_mut().set_borrowed_window(buffer)
1722            },
1723            super::strategy::BackendTag::Dfast => unsafe {
1724                self.dfast_matcher_mut().set_borrowed_window(buffer)
1725            },
1726            super::strategy::BackendTag::Row => unsafe {
1727                self.row_matcher_mut().set_borrowed_window(buffer)
1728            },
1729            super::strategy::BackendTag::HashChain => unsafe {
1730                self.hc_matcher_mut().set_borrowed_window(buffer)
1731            },
1732        }
1733    }
1734
1735    /// Clear the borrowed one-shot window, returning the active backend
1736    /// to the owned `history` path.
1737    pub(crate) fn clear_borrowed_window(&mut self) {
1738        match self.active_backend() {
1739            super::strategy::BackendTag::Simple => self.simple_mut().clear_borrowed_window(),
1740            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().clear_borrowed_window(),
1741            super::strategy::BackendTag::Row => self.row_matcher_mut().clear_borrowed_window(),
1742            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().clear_borrowed_window(),
1743            #[allow(unreachable_patterns)]
1744            _ => {}
1745        }
1746        self.borrowed_pending = None;
1747    }
1748
1749    /// Stage the borrowed block range `[block_start, block_end)` for the
1750    /// NEXT `start_matching` / `skip_matching_with_hint`, which the
1751    /// borrowed Fast frame path uses in place of `commit_space`. While
1752    /// staged, those trait calls route to the Simple backend's borrowed
1753    /// scan/skip (consuming the stage) instead of the owned committed
1754    /// block. See [`Matcher::start_matching`] /
1755    /// [`Matcher::skip_matching_with_hint`] on this type.
1756    pub(crate) fn set_borrowed_block(&mut self, block_start: usize, block_end: usize) {
1757        assert!(
1758            self.borrowed_supported(),
1759            "borrowed block staging is not supported for the active backend/search config",
1760        );
1761        assert!(
1762            block_start <= block_end,
1763            "borrowed block range must satisfy start <= end (start={block_start} end={block_end})",
1764        );
1765        self.borrowed_pending = Some((block_start, block_end));
1766        // Make the range visible to `get_last_space()` immediately: the
1767        // emit pipeline reads `get_last_space().len()` in
1768        // `collect_block_parts` BEFORE `start_matching` consumes the
1769        // stage, so the staged block (not the whole borrowed window) must
1770        // be reported now to keep the literal-buffer reservation right.
1771        match self.active_backend() {
1772            super::strategy::BackendTag::Simple => self
1773                .simple_mut()
1774                .stage_borrowed_block(block_start, block_end),
1775            super::strategy::BackendTag::Dfast => self
1776                .dfast_matcher_mut()
1777                .stage_borrowed_block(block_start, block_end),
1778            super::strategy::BackendTag::Row => self
1779                .row_matcher_mut()
1780                .stage_borrowed_block(block_start, block_end),
1781            super::strategy::BackendTag::HashChain => self
1782                .hc_matcher_mut()
1783                .table
1784                .stage_borrowed_block(block_start, block_end),
1785        }
1786    }
1787
1788    #[cfg(test)]
1789    fn dfast_matcher(&self) -> &DfastMatchGenerator {
1790        match &self.storage {
1791            MatcherStorage::Dfast(m) => m,
1792            _ => panic!("dfast backend must be initialized by reset() before use"),
1793        }
1794    }
1795
1796    fn dfast_matcher_mut(&mut self) -> &mut DfastMatchGenerator {
1797        match &mut self.storage {
1798            MatcherStorage::Dfast(m) => m,
1799            _ => panic!("dfast backend must be initialized by reset() before use"),
1800        }
1801    }
1802
1803    #[cfg(test)]
1804    fn row_matcher(&self) -> &RowMatchGenerator {
1805        match &self.storage {
1806            MatcherStorage::Row(m) => m,
1807            _ => panic!("row backend must be initialized by reset() before use"),
1808        }
1809    }
1810
1811    fn row_matcher_mut(&mut self) -> &mut RowMatchGenerator {
1812        match &mut self.storage {
1813            MatcherStorage::Row(m) => m,
1814            _ => panic!("row backend must be initialized by reset() before use"),
1815        }
1816    }
1817
1818    #[cfg(test)]
1819    fn hc_matcher(&self) -> &HcMatchGenerator {
1820        match &self.storage {
1821            MatcherStorage::HashChain(m) => m,
1822            _ => panic!("hash chain backend must be initialized by reset() before use"),
1823        }
1824    }
1825
1826    fn hc_matcher_mut(&mut self) -> &mut HcMatchGenerator {
1827        match &mut self.storage {
1828            MatcherStorage::HashChain(m) => m,
1829            _ => panic!("hash chain backend must be initialized by reset() before use"),
1830        }
1831    }
1832
1833    /// Shrink the active backend's `max_window_size` by the bytes
1834    /// reclaimed from the dictionary-retention budget. Returns `true`
1835    /// iff any reclamation happened — the caller uses that as the
1836    /// gate for [`Self::trim_after_budget_retire`] (which is a no-op
1837    /// otherwise: with `max_window_size` unchanged the backend's
1838    /// `trim_to_window` cannot find anything to evict, so calling it
1839    /// just runs an extra `match` ladder + a single early-out check
1840    /// per slice commit).
1841    #[must_use]
1842    fn retire_dictionary_budget(&mut self, evicted_bytes: usize) -> bool {
1843        let reclaimed = evicted_bytes.min(self.dictionary_retained_budget);
1844        if reclaimed == 0 {
1845            return false;
1846        }
1847        self.dictionary_retained_budget -= reclaimed;
1848        match self.active_backend() {
1849            super::strategy::BackendTag::Simple => {
1850                let matcher = self.simple_mut();
1851                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1852                // retained dict budget is tracked independently and the
1853                // window may already have been shrunk by a prior eviction,
1854                // so the floor at 0 is the correct clamp, not a masked bug.
1855                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1856            }
1857            super::strategy::BackendTag::Dfast => {
1858                let matcher = self.dfast_matcher_mut();
1859                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1860                // retained dict budget is tracked independently and the
1861                // window may already have been shrunk by a prior eviction,
1862                // so the floor at 0 is the correct clamp, not a masked bug.
1863                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1864            }
1865            super::strategy::BackendTag::Row => {
1866                let matcher = self.row_matcher_mut();
1867                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1868                // retained dict budget is tracked independently and the
1869                // window may already have been shrunk by a prior eviction,
1870                // so the floor at 0 is the correct clamp, not a masked bug.
1871                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1872            }
1873            super::strategy::BackendTag::HashChain => {
1874                let matcher = self.hc_matcher_mut();
1875                // See the Simple arm: `reclaimed` may exceed the current
1876                // window, so saturating to 0 is the correct clamp.
1877                matcher.table.max_window_size =
1878                    matcher.table.max_window_size.saturating_sub(reclaimed);
1879            }
1880        }
1881        true
1882    }
1883
1884    fn trim_after_budget_retire(&mut self) {
1885        loop {
1886            let mut evicted_bytes = 0usize;
1887            match self.active_backend() {
1888                super::strategy::BackendTag::Simple => {
1889                    // FastKernelMatcher owns its history as a single
1890                    // flat `Vec<u8>` (upstream zstd's flat-buffer layout)
1891                    // rather than the legacy per-block `WindowEntry`
1892                    // stack. There are no per-block Vec allocations
1893                    // to recycle into `vec_pool` — `trim_to_window`
1894                    // drains the oldest bytes in-place and returns
1895                    // the count for the dictionary-budget loop's
1896                    // termination check.
1897                    let MatcherStorage::Simple(m) = &mut self.storage else {
1898                        unreachable!("active_backend() == Simple proven above");
1899                    };
1900                    evicted_bytes += m.trim_to_window();
1901                }
1902                super::strategy::BackendTag::Dfast => {
1903                    // Dfast doesn't retain input Vecs — `history` is the
1904                    // only byte store, so there is no per-block buffer
1905                    // to push back through a callback. Eviction byte
1906                    // count is derived from the `window_size` delta
1907                    // before/after; the Dfast variant of
1908                    // `trim_to_window` takes no closure, sidestepping
1909                    // an unused-`impl FnMut` monomorphization that
1910                    // would otherwise contractually never fire.
1911                    let dfast = self.dfast_matcher_mut();
1912                    let pre = dfast.window_size;
1913                    dfast.trim_to_window();
1914                    evicted_bytes += pre - dfast.window_size;
1915                }
1916                super::strategy::BackendTag::Row => {
1917                    // Row keeps bytes only in the contiguous `history` mirror
1918                    // (block buffers are returned to the pool per block in
1919                    // `add_data`), so derive the eviction count from the
1920                    // `window_size` delta, mirroring the Dfast / HashChain arms.
1921                    let row = self.row_matcher_mut();
1922                    let pre = row.window_size;
1923                    row.trim_to_window();
1924                    evicted_bytes += pre - row.window_size;
1925                }
1926                super::strategy::BackendTag::HashChain => {
1927                    // HC keeps bytes only in the contiguous `history` mirror
1928                    // (no per-block Vecs to recycle since the window<->history
1929                    // dedup), so derive the eviction count from the
1930                    // `window_size` delta, mirroring the Dfast arm above.
1931                    let table = &mut self.hc_matcher_mut().table;
1932                    let pre = table.window_size;
1933                    table.trim_to_window();
1934                    evicted_bytes += pre - table.window_size;
1935                }
1936            }
1937            if evicted_bytes == 0 {
1938                break;
1939            }
1940            // The loop's invariant is "the backend's previous
1941            // `max_window_size` shrink had downstream bytes left to
1942            // evict" — that's what `evicted_bytes != 0` proves at
1943            // this point. `dictionary_retained_budget` is NOT
1944            // guaranteed to be positive here: the outer
1945            // `retire_dictionary_budget` call may have already
1946            // drained it to zero by reclaiming the last retained
1947            // bytes, while the backend still has bytes above the
1948            // freshly-shrunk window cap waiting for this loop to
1949            // evict. The return value of the retire call below is
1950            // therefore intentionally discarded — the loop's
1951            // termination is driven by `evicted_bytes == 0`, not by
1952            // whether the budget has more bytes left to reclaim.
1953            let _ = self.retire_dictionary_budget(evicted_bytes);
1954        }
1955    }
1956
1957    /// ATTACH (`true`) vs COPY (`false`) decision for the dms-bearing HashChain
1958    /// backend (lazy hash-chain AND binary-tree/optimal levels), mirroring
1959    /// upstream `ZSTD_shouldAttachDict` and its per-strategy `attachDictSizeCutoffs`:
1960    /// a small / unknown source ATTACHES the dict as a separate dms (hash-chain
1961    /// dms for lazy, DUBT dms for BT); a large known source COPIES it into the
1962    /// live chain / tree. The cutoff is the lazy/lazy2 value for HC, the
1963    /// btlazy2/btopt value for Bt{Opt}, and the smaller btultra/btultra2 value for
1964    /// the deepest parses. Both `skip_matching_for_dictionary_priming` (which
1965    /// stages the dict) and `prime_with_dictionary` (which builds-or-drops the
1966    /// dms) read this so the two stay in lock-step.
1967    fn hc_dict_attach_mode(&self) -> bool {
1968        // Only the HashChain backend (lazy hash-chain + BT/optimal) routes here;
1969        // a non-HashChain storage has no dms decision, so default to attach.
1970        let MatcherStorage::HashChain(hc) = &self.storage else {
1971            return true;
1972        };
1973        let cutoff = if hc.table.uses_bt {
1974            match hc.strategy_tag {
1975                super::strategy::StrategyTag::BtUltra | super::strategy::StrategyTag::BtUltra2 => {
1976                    BT_ULTRA_ATTACH_DICT_CUTOFF_LOG
1977                }
1978                _ => BT_OPT_ATTACH_DICT_CUTOFF_LOG,
1979            }
1980        } else {
1981            HC_ATTACH_DICT_CUTOFF_LOG
1982        };
1983        self.reset_size_log.is_none_or(|log| log <= cutoff)
1984    }
1985
1986    fn skip_matching_for_dictionary_priming(&mut self) {
1987        match self.active_backend() {
1988            super::strategy::BackendTag::Simple => {
1989                // Upstream zstd `ZSTD_shouldAttachDict` mode selection for the Fast
1990                // strategy (cutoff 8 KB): small / unknown-size inputs ATTACH
1991                // (index dict positions into a SEPARATE immutable table; the
1992                // dual-probe 2-cursor `compress_block_fast_dict` then prefers
1993                // recent-input matches and falls back to the dict — the path
1994                // that wins small/unknown). Large known-size inputs COPY (prime
1995                // dict into the live table; the 4-cursor `compress_block_fast`
1996                // matches against it as window history — the path that already
1997                // matches/beats the upstream zstd on large corpora). The dispatch in
1998                // `start_matching` keys off `dict_table.is_some()`, which only
1999                // the attach path populates. See [`FAST_ATTACH_DICT_CUTOFF_LOG`].
2000                let attach = self.reset_dict_attach_ok
2001                    && self
2002                        .reset_size_log
2003                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2004                if attach {
2005                    self.simple_mut().skip_matching_for_dict_prime();
2006                } else {
2007                    self.simple_mut().skip_matching_with_hint(Some(false));
2008                }
2009                self.recycle_simple_space();
2010            }
2011            super::strategy::BackendTag::Dfast => {
2012                // Upstream zstd `ZSTD_dictMatchState` mode selection for dfast (cutoff
2013                // 16 KiB): small / unknown-size inputs ATTACH (build the
2014                // separate immutable dict long+short tables; the dual-probe
2015                // `start_matching_fast_loop` searches live + dict, the path that
2016                // avoids the per-frame dict re-prime that dominates small
2017                // `compress-dict`). Larger known-size inputs COPY (re-prime the
2018                // dict into the live tables via `skip_matching_dense`, where the
2019                // dense scan matches it as window history). `skip_matching_for_dict_attach`
2020                // self-gates on `use_fast_loop` (only fast-loop levels carry the
2021                // dual-probe; general-path levels fall back to the dense copy).
2022                let attach = self
2023                    .reset_size_log
2024                    .is_none_or(|log| log <= DFAST_ATTACH_DICT_CUTOFF_LOG);
2025                if attach {
2026                    self.dfast_matcher_mut().skip_matching_for_dict_attach();
2027                } else {
2028                    self.dfast_matcher_mut().invalidate_dict_cache();
2029                    self.dfast_matcher_mut().skip_matching_dense();
2030                }
2031            }
2032            super::strategy::BackendTag::Row => {
2033                // Upstream zstd `ZSTD_RowFindBestMatch` `dictMatchState`: small /
2034                // unknown-size inputs ATTACH (build the separate immutable dict
2035                // row index; the bounded dual-probe in `row_candidate_rl`
2036                // searches live + dict, avoiding the per-frame dict re-index),
2037                // larger known-size inputs COPY (dense re-prime into the live
2038                // rows).
2039                let attach = self
2040                    .reset_size_log
2041                    .is_none_or(|log| log <= ROW_ATTACH_DICT_CUTOFF_LOG);
2042                if attach {
2043                    self.row_matcher_mut().prime_dict_attach_current_block();
2044                } else {
2045                    self.row_matcher_mut().invalidate_dict_cache();
2046                    self.row_matcher_mut().skip_matching_with_hint(Some(false));
2047                }
2048            }
2049            super::strategy::BackendTag::HashChain => {
2050                // Lazy-HC AND BT/optimal both follow upstream zstd `ZSTD_shouldAttachDict`
2051                // per-strategy: ATTACH (a separate dms — hash-chain dms for lazy,
2052                // DUBT dms for BT) for small / unknown inputs, COPY (merge the dict
2053                // into the live chain/tree) for large known inputs. ATTACH keeps
2054                // the dict in history but out of the live structure via
2055                // `skip_matching_dict_bt` (the cursor advance is shared by both
2056                // arms); COPY routes through the normal `skip_matching` (its
2057                // `uses_bt` branch fills the live tree, the lazy branch the live
2058                // chain). The dms is built-or-dropped to match in
2059                // `prime_with_dictionary`.
2060                if self.hc_dict_attach_mode() {
2061                    self.hc_matcher_mut().table.skip_matching_dict_bt();
2062                } else {
2063                    self.hc_matcher_mut().skip_matching(Some(false));
2064                }
2065            }
2066        }
2067    }
2068}
2069
2070impl Matcher for MatchGeneratorDriver {
    /// Reports dictionary-priming support; this driver always answers `true`.
    fn supports_dictionary_priming(&self) -> bool {
        true
    }
2074
    /// Store `size` as the pending source-size hint. The hint is one-shot:
    /// `reset` removes it with `take()` when it resolves the level
    /// parameters.
    fn set_source_size_hint(&mut self, size: u64) {
        self.source_size_hint = Some(size);
    }
2078
    /// Store `size` as the pending dictionary-size hint. The hint is
    /// one-shot: `reset` removes it with `take()` and uses it for the
    /// attach/copy decision and dictionary-driven table sizing.
    fn set_dictionary_size_hint(&mut self, size: usize) {
        self.dictionary_size_hint = Some(size);
    }
2082
2083    /// Dict-relevance gate for the raw-fast-path. Reached only when a dictionary
2084    /// is active (the caller short-circuits on `dict_active`), so this answers
2085    /// "could the dict compress this otherwise-incompressible-looking block?".
2086    /// The Simple (Fast) backend samples its dict table precisely
2087    /// ([`FastKernelMatcher::block_samples_match_dict`]); the other backends
2088    /// (Dfast / Row / HashChain / BT) have their own dict structures and no cheap
2089    /// probe here, so they answer CONSERVATIVELY `true`: without a probe they
2090    /// cannot tell whether the dict compresses an incompressible-LOOKING block,
2091    /// and answering `false` would let the raw-fast-path emit such a block raw
2092    /// and miss an embedded dict segment. `dictionary_segment_in_incompressible_input_is_matched`
2093    /// pins this for Dfast/Row/BT — the 512-byte dict run inside high-entropy
2094    /// filler is matched only because these backends stay on the scan. So they
2095    /// keep the blanket scan the old `!dict_active` gate gave them; only the
2096    /// Simple/Fast backend trades it for the precise probe.
2097    fn block_samples_match_dict(&self, block: &[u8]) -> bool {
2098        match &self.storage {
2099            MatcherStorage::Simple(m) => m.block_samples_match_dict(block),
2100            _ => true,
2101        }
2102    }
2103
2104    /// Heap bytes this driver owns: the active backend's tables/history, the
2105    /// recycled input-buffer pool, and the primed-dictionary snapshot (a cloned
2106    /// backend kept for CDict-equivalent reuse). The inline struct itself is
2107    /// accounted by the owner's `size_of`.
2108    fn heap_size(&self) -> usize {
2109        let pool: usize = self.vec_pool.capacity() * core::mem::size_of::<Vec<u8>>()
2110            + self.vec_pool.iter().map(Vec::capacity).sum::<usize>();
2111        let snapshot = self
2112            .primed
2113            .as_ref()
2114            .map_or(0, |(storage, _, _)| storage.heap_size());
2115        pool + self.storage.heap_size() + snapshot
2116    }
2117
    /// Drop any stored public-parameter overrides. `reset` applies overrides
    /// only while `param_overrides` is `Some` and non-empty, so after this
    /// call it uses the level-resolved parameters alone.
    fn clear_param_overrides(&mut self) {
        self.param_overrides = None;
    }
2121
2122    fn reset(&mut self, level: CompressionLevel) {
2123        let hint = self.source_size_hint.take();
2124        let dict_hint = self.dictionary_size_hint.take();
2125        // Snapshot the hint's normalized ceil-log bucket for the primed-snapshot
2126        // key and prime_with_dictionary's attach/copy mode decision (the hint is
2127        // consumed here, but priming happens just after reset). Storing the
2128        // bucket rather than the raw bytes means two hints that resolve to the
2129        // same matcher shape share one snapshot instead of each re-priming.
2130        self.reset_size_log = hint.map(source_size_ceil_log);
2131        // A dictionary too large for the tagged attach position field falls back
2132        // to copy mode. Captured here (from the load-set size hint = actual dict
2133        // length) so the prime decision and the snapshot-key / epoch bits agree.
2134        self.reset_dict_attach_ok =
2135            dict_hint.is_none_or(|size| size <= MAX_FAST_ATTACH_DICT_REGION);
2136        let hinted = hint.is_some();
2137        #[cfg_attr(not(test), allow(unused_mut))]
2138        let mut params = Self::level_params(level, hint);
2139        // Test-only: apply a parse×search override so the matrix can be
2140        // exercised without editing `LEVEL_TABLE`. Mutating `params` here
2141        // (before `next_backend`) flows the override through storage
2142        // selection, `configure`, and the `self.search`/`self.parse`
2143        // writes uniformly. Consumed with `take()` so it is one-shot: the
2144        // synthetic pairing applies to exactly this `reset()`, and a later
2145        // reset on the same driver falls back to the level's real config.
2146        #[cfg(test)]
2147        if let Some((search, parse)) = self.config_override.take() {
2148            params.search = search;
2149            params.lazy_depth = parse.lazy_depth();
2150            // The matrix sweep can pair a level with a backend its native
2151            // row doesn't populate (e.g. greedy L5, which carries only `row`,
2152            // run on HashChain). Synthesize a default config for the
2153            // overridden backend so its `configure` arm has something to read.
2154            use super::strategy::SearchMethod;
2155            match search {
2156                SearchMethod::Fast => {
2157                    params.fast.get_or_insert(FAST_L1);
2158                }
2159                SearchMethod::DoubleFast => {
2160                    params.dfast.get_or_insert(DFAST_L3);
2161                }
2162                SearchMethod::RowHash => {
2163                    params.row.get_or_insert(ROW_CONFIG);
2164                }
2165                SearchMethod::HashChain | SearchMethod::BinaryTree => {
2166                    params.hc.get_or_insert(HC_CONFIG);
2167                }
2168            }
2169        }
2170        // Public-parameter overrides (#27): apply the per-knob set on top
2171        // of the level-resolved params. A strategy override re-routes the
2172        // backend, so this must precede `next_backend` selection. The
2173        // all-`None` case is skipped so default level geometry stays
2174        // byte-identical to plain level-based compression.
2175        if let Some(ov) = self.param_overrides
2176            && !ov.is_empty()
2177        {
2178            apply_param_overrides(&mut params, &ov);
2179            // `Self::level_params(level, hint)` applied the source-size cap
2180            // for the LEVEL's native backend. If a strategy override moved
2181            // the frame onto a different backend, `apply_param_overrides`
2182            // synthesized that backend's DEFAULT config (FAST_L1 /
2183            // HC_OVERRIDE_DEFAULT) with full-size table logs AFTER that cap
2184            // ran. Re-apply the hint cap so a tiny hinted frame doesn't
2185            // allocate the new backend's full-size tables. An explicit
2186            // `window_log` override is the user's hard request and must
2187            // survive the re-cap, so restore it afterwards.
2188            if let Some(hint_size) = hint {
2189                params = adjust_params_for_source_size(params, hint_size);
2190                if let Some(window_log) = ov.window_log {
2191                    params.window_log = window_log;
2192                }
2193            }
2194        }
2195        // Dictionary-driven table sizing — parity with upstream zstd `ZSTD_createCDict`
2196        // (`ZSTD_getCParams_internal(level, UNKNOWN, dictSize, ZSTD_cpm_createCDict)`
2197        // → `ZSTD_adjustCParams_internal`). A loaded dictionary supplies the
2198        // long-distance matches, so upstream zstd sizes the prepared match-finder tables
2199        // to the DICTIONARY (assuming a `minSrcSize` source), not the live
2200        // window: it downsizes `hashLog`/`chainLog` toward the dict-and-window
2201        // log while leaving the frame's eviction `window_log` source-derived so
2202        // the dictionary bytes stay referenceable (`ZSTD_resetCCtx_byCopyingCDict`
2203        // copies the small CDict tables but keeps the source window). We apply
2204        // the same downsizing to the level's own hc geometry and cap (min) so a
2205        // dict never inflates the level tables. Only the binary-tree / hash-chain
2206        // backend reads `hc.{hash,chain}_log`; Simple/Dfast/Row derive their
2207        // widths from the source window in their `reset` arms.
2208        // A zero-length dictionary is "no dictionary": running the CDict sizing
2209        // path for `Some(0)` is not a no-op — `cdict_table_logs(.., 0)` still
2210        // collapses the HC/BT tables toward the 513-byte upstream zstd tier via
2211        // `DICT_MIN_SRC_SIZE`, tanking ratio/perf on the next frame. Priming
2212        // already treats empty content as empty, so skip the downsizing here too.
2213        if let Some(dict_size) = dict_hint.filter(|&size| size > 0) {
2214            // Derive the dict-tier geometry from the level's FULL (un-source-capped)
2215            // hc widths. `Self::level_params(level, hint)` already source-capped
2216            // `params.hc`; feeding those capped widths into `cdict_table_logs` and
2217            // then `.min()`-ing would double-cap, so on a small hinted source with a
2218            // large dictionary the prepared tables collapse below what the dict needs
2219            // — defeating the `ZSTD_createCDict` geometry this mirrors. Take the
2220            // un-hinted base widths instead and assign the result directly:
2221            // `cdict_table_logs` only ever downsizes, so it never exceeds the base
2222            // level geometry, while the eviction `window_log` stays source-derived so
2223            // the dictionary bytes remain referenceable. Active public-parameter
2224            // overrides (#27) are applied to the base too, so a strategy override
2225            // that routes onto HashChain/BinaryTree still gets dict-tier sizing and
2226            // explicit hash/chain overrides feed through as the geometry ceiling.
2227            let mut base_params = Self::level_params(level, None);
2228            if let Some(ov) = self.param_overrides
2229                && !ov.is_empty()
2230            {
2231                apply_param_overrides(&mut base_params, &ov);
2232            }
2233            if let (Some(hc), Some(base_hc)) = (params.hc.as_mut(), base_params.hc) {
2234                let uses_bt = matches!(
2235                    params.strategy_tag,
2236                    super::strategy::StrategyTag::Btlazy2
2237                        | super::strategy::StrategyTag::BtOpt
2238                        | super::strategy::StrategyTag::BtUltra
2239                        | super::strategy::StrategyTag::BtUltra2
2240                );
2241                let (dict_hash_log, dict_chain_log) = cdict_table_logs(
2242                    params.window_log,
2243                    base_hc.hash_log,
2244                    base_hc.chain_log,
2245                    uses_bt,
2246                    dict_size,
2247                );
2248                hc.hash_log = dict_hash_log;
2249                hc.chain_log = dict_chain_log;
2250            }
2251        }
2252        // upstream zstd `ZSTD_resolveRowMatchFinderMode` (zstd_compress.c:238-245):
2253        // the row matchfinder is used for greedy/lazy/lazy2 ONLY when
2254        // `windowLog > 14`; at or below that upstream runs the hash-chain
2255        // matcher (`ZSTD_HcFindBestMatch`). We previously hardcoded the Row
2256        // backend for these strategies regardless of window, sending every
2257        // small-window frame (hinted floor = windowLog 14, e.g. the small-4k/10k
2258        // fixtures) through Row where upstream uses HC. Match it: fall back to
2259        // the hash-chain matcher (lazy/greedy parse via `lazy_depth`) when the
2260        // resolved window is <= 14. The HC config is synthesised from the
2261        // level's RowConfig (HC and Row share the same cParams; only the
2262        // matchfinder differs) — `hash_log` / `chain_log` are
2263        // clamped to the (<= 14) window inside the HashChain reset arm, so the
2264        // nominal width here only sets the clamp ceiling.
2265        if params.search == super::strategy::SearchMethod::RowHash && params.window_log <= 14 {
2266            let row = params
2267                .row
2268                .expect("a RowHash level row must carry a RowConfig");
2269            params.search = super::strategy::SearchMethod::HashChain;
2270            // For a dict-bearing frame, downsize the synthesised HC logs to the
2271            // dictionary's content tier via `cdict_table_logs` (the same
2272            // correction the native HC dict-prime path applies above), so a dict
2273            // much smaller than the window doesn't prime a needlessly sparse
2274            // table. Row-finder levels are never BinaryTree, so `uses_bt = false`.
2275            //
2276            // Feed `cdict_table_logs` the UN-hinted base Row width, not the
2277            // resolved `row.hash_bits`: the latter is already source-capped on a
2278            // hinted reset (the `row_cap = table_log + 1` clamp), so passing it
2279            // here would double-cap exactly as the native HC dict path warns
2280            // above — a small hinted source with a large dictionary would
2281            // collapse the prepared table below what the dict needs.
2282            // `cdict_table_logs` only ever downsizes, so deriving the ceiling
2283            // from the un-hinted base (plus active public overrides) keeps the
2284            // dict-tier geometry intact. No source hint => `row.hash_bits` is
2285            // already the level's full width, so reuse it directly.
2286            let row_cdict_hash_bits = match dict_hint.filter(|&size| size > 0) {
2287                Some(_) => {
2288                    let mut base_params = Self::level_params(level, None);
2289                    if let Some(ov) = self.param_overrides
2290                        && !ov.is_empty()
2291                    {
2292                        apply_param_overrides(&mut base_params, &ov);
2293                    }
2294                    base_params
2295                        .row
2296                        .map_or(row.hash_bits, |base_row| base_row.hash_bits)
2297                }
2298                None => row.hash_bits,
2299            };
2300            // Row-backed levels carry only `hash_bits`; the HC chain table they
2301            // fall back to follows the upstream zstd cParams relationship `chainLog =
2302            // hashLog - 1` for every Row level (L6 c18 h19 .. L12 c22 h23, see
2303            // the ROW_L* tables). Synthesise the chain width as `hash_bits - 1`
2304            // so the dict path doesn't leave the chain table one bit too wide
2305            // (cdict_table_logs only downsizes, so passing the full hash width
2306            // for both would keep a 2x-too-large chain table on dict frames).
2307            // Raw `- 1` is underflow-safe: `hash_bits` is either a predefined
2308            // ROW_L* width (>= 19) or a public `hash_log` override, and the
2309            // override is range-validated to `ZSTD_HASHLOG_MIN = 6` at the
2310            // parameter API, so the value is always >= 6 here.
2311            //
2312            // A public `chain_log` override (#27) is dropped by the RowHash
2313            // override arm (Row has no chain table), but once this frame falls
2314            // back to HC the chain table is live and must honour it — mirror
2315            // the native HC dict path, which feeds the override-applied
2316            // `base_hc.chain_log` into `cdict_table_logs`. Use the explicit
2317            // override (also API-validated to ZSTD_CHAINLOG_MIN = 6) when set,
2318            // else the upstream zstd `hashLog - 1` relationship.
2319            let explicit_chain_log = self
2320                .param_overrides
2321                .filter(|ov| !ov.is_empty())
2322                .and_then(|ov| ov.chain_log)
2323                .map(|chain_log| chain_log as usize);
2324            let row_cdict_chain_bits = explicit_chain_log.unwrap_or(row_cdict_hash_bits - 1);
2325            let (mut hash_log, mut chain_log) = match dict_hint.filter(|&size| size > 0) {
2326                Some(dict_size) => cdict_table_logs(
2327                    params.window_log,
2328                    row_cdict_hash_bits,
2329                    row_cdict_chain_bits,
2330                    false,
2331                    dict_size,
2332                ),
2333                None => (
2334                    row.hash_bits,
2335                    explicit_chain_log.unwrap_or(row.hash_bits - 1),
2336                ),
2337            };
2338            // No-dict path: the HashChain reset arm only clamps the logs to the
2339            // window when `hinted`, but a public `window_log` override can lower
2340            // this level to <= 14 with no source hint — clamp the level's full
2341            // Row `hash_bits` to the window here too (upstream zstd `ZSTD_adjustCParams`:
2342            // hashLog <= windowLog + 1, chainLog <= windowLog) so a 16 KiB window
2343            // doesn't allocate Row-sized HC tables.
2344            if dict_hint.filter(|&size| size > 0).is_none() {
2345                let wlog = params.window_log as usize;
2346                hash_log = hash_log.min(wlog + 1);
2347                chain_log = chain_log.min(wlog);
2348            }
2349            params.hc = Some(HcConfig {
2350                hash_log,
2351                chain_log,
2352                search_depth: row.search_depth,
2353                target_len: row.target_len,
2354                search_mls: 4,
2355            });
2356            params.row = None;
2357        }
2358        let next_backend = params.backend();
2359        let max_window_size = 1usize << params.window_log;
2360        self.dictionary_retained_budget = 0;
2361        // Drop any frame-local borrowed staging so it can't leak across a
2362        // reset and misroute the next start/skip into borrowed dispatch.
2363        self.borrowed_pending = None;
2364        if self.active_backend() != next_backend {
2365            // Drain the outgoing backend's allocations into the shared
2366            // pool. The `match &mut self.storage { ... }` block runs to
2367            // completion before the assignment below replaces the
2368            // variant, so the inner state we just drained is dropped
2369            // with the old variant.
2370            match &mut self.storage {
2371                MatcherStorage::Simple(_m) => {
2372                    // FastKernelMatcher owns a flat Vec<u8> history
2373                    // and a Vec<u32> hash table — both drop with the
2374                    // variant assignment below, no per-block buffers
2375                    // to recycle into the driver pools. The
2376                    // assignment-replace path collapses to a noop
2377                    // pre-pass for this backend.
2378                }
2379                MatcherStorage::Dfast(m) => {
2380                    // Drop the long / short hash table allocations
2381                    // before calling `m.reset`. Without this prepass,
2382                    // `DfastMatchGenerator::reset` would `fill` both
2383                    // tables with `DFAST_EMPTY_SLOT` sentinels — wasted
2384                    // work given the next assignment to `self.storage`
2385                    // is about to drop `m` entirely. `reset` itself
2386                    // short-circuits on `if !self.tables.is_empty()`, so
2387                    // handing it an empty `Vec` skips the fill loop.
2388                    // Mirrors the pre-drain pattern in the HashChain
2389                    // arm below (and serves the same peak-memory
2390                    // purpose: release the table-allocation footprint
2391                    // before constructing the replacement variant).
2392                    m.tables = Vec::new();
2393                    m.reset();
2394                }
2395                MatcherStorage::Row(m) => {
2396                    m.row_heads = Vec::new();
2397                    m.row_positions = Vec::new();
2398                    m.row_tags = Vec::new();
2399                    m.reset();
2400                }
2401                MatcherStorage::HashChain(m) => {
2402                    // Release oversized tables when switching away from
2403                    // HashChain so Best's larger allocations don't persist.
2404                    // hash3_table must be released alongside the other
2405                    // two: BtUltra2's `1 << HC3_HASH_LOG` entries would
2406                    // otherwise stay pinned across the backend switch,
2407                    // even though no future caller of this backend will
2408                    // touch them.
2409                    m.table.hash_table = Vec::new();
2410                    m.table.chain_table = Vec::new();
2411                    m.table.hash3_table = Vec::new();
2412                    let vec_pool = &mut self.vec_pool;
2413                    m.reset(|mut data| {
2414                        data.resize(data.capacity(), 0);
2415                        vec_pool.push(data);
2416                    });
2417                }
2418            }
2419            // Swap in a fresh variant for the new backend. The previous
2420            // `storage` is dropped here.
2421            self.storage = match next_backend {
2422                super::strategy::BackendTag::Simple => {
2423                    // Per-level Fast cParams from resolve_level_params:
2424                    // Level(1) gets (hash_log=14, mls=7); Level(-7..=-1)
2425                    // get upstream zstd row-0 (hash_log=13, mls=7); Fastest /
2426                    // Uncompressed keep (hash_log=14, mls=6). See
2427                    // resolve_level_params for rationale.
2428                    let fast = params.fast.expect("Fast level row carries a FastConfig");
2429                    MatcherStorage::Simple(FastKernelMatcher::with_params(
2430                        params.window_log,
2431                        fast.hash_log,
2432                        fast.mls,
2433                        fast.step_size,
2434                    ))
2435                }
2436                super::strategy::BackendTag::Dfast => {
2437                    MatcherStorage::Dfast(DfastMatchGenerator::new(max_window_size))
2438                }
2439                super::strategy::BackendTag::Row => {
2440                    MatcherStorage::Row(RowMatchGenerator::new(max_window_size))
2441                }
2442                super::strategy::BackendTag::HashChain => {
2443                    MatcherStorage::HashChain(HcMatchGenerator::new(max_window_size))
2444                }
2445            };
2446        }
2447
2448        // Single source of truth: `LevelParams::strategy_tag` is the
2449        // authoritative mapping from `CompressionLevel` to strategy.
2450        // `storage.backend()` derives the parse family from the variant,
2451        // so there is no separate runtime tag that could drift against
2452        // `LEVEL_TABLE`.
2453        self.strategy_tag = params.strategy_tag;
2454        self.search = params.search;
2455        self.parse = params.parse();
2456        self.slice_size = self.base_slice_size.min(max_window_size);
2457        self.reported_window_size = max_window_size;
2458        let strategy_tag = self.strategy_tag;
2459        // Source-proportional table window for the backends whose hash-table
2460        // widths are recomputed here (Dfast / Row). Like the HC / Fast caps
2461        // in `adjust_params_for_source_size`, this sizes the internal tables
2462        // from the RAW source log (not the wire `window_log` floor) so a
2463        // small frame zeroes a small table; it never exceeds the real window.
2464        let table_window_size = match hint {
2465            Some(h) => {
2466                let raw_log = source_size_ceil_log(h);
2467                // Clamp the shift below the pointer width before `1usize <<`:
2468                // an oversized hint (>= 2^63 + 1, and on 32-bit usize any hint
2469                // >= 2^32) drives `raw_log` to 64 / >= 32, and the shift would
2470                // overflow (panic in debug, wrap to 0 in release) before the
2471                // `.min(max_window_size)` cap below could bound it. The min cap
2472                // still provides the real semantic window bound.
2473                let shift = raw_log.max(MIN_WINDOW_LOG).min(usize::BITS as u8 - 1);
2474                (1usize << shift).min(max_window_size)
2475            }
2476            None => max_window_size,
2477        };
2478        // The hint-dependent hash-table width the active backend applies, for
2479        // the primed-snapshot key. Dfast/Row compute it from `table_window_size`
2480        // below; HC/Fast leave it `0` because their widths live in `params`
2481        // (`hc.{hash,chain}_log` / `fast_hash_log`) — already part of the key.
2482        let mut resolved_table_bits: usize = 0;
2483        match &mut self.storage {
2484            MatcherStorage::Simple(m) => {
2485                // Per-level Fast cParams threaded from
2486                // resolve_level_params (see Simple-backend swap
2487                // arm above for the (level → params) mapping).
2488                let fast = params.fast.expect("Fast level row carries a FastConfig");
2489                // Same attach/copy split the dict-prime dispatch applies
2490                // below (`prime_with_dictionary`): only attach-mode dict
2491                // frames may keep the main table across the reset via an
2492                // epoch advance — copy-mode and no-dict frames must memset
2493                // it back to bias 0 for the raw-slice kernels.
2494                // `Some(0)` is "no dictionary" (the dict-sizing path above
2495                // filters it the same way): an empty dict primes nothing, so
2496                // an epoch-advance reset would preserve stale attach state
2497                // instead of clearing it.
2498                let dict_attach_epoch = matches!(dict_hint, Some(size) if size > 0)
2499                    && self.reset_dict_attach_ok
2500                    && self
2501                        .reset_size_log
2502                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2503                // Copy-mode dictionary frame whose primed snapshot matches
2504                // this exact resolved shape: `restore_primed_dictionary`
2505                // (called right after this reset; the caller gates the
2506                // restore on the same size bucket and the restore re-checks
2507                // the same key) will `clone_from` the snapshot over this
2508                // matcher, replacing the table contents and bias wholesale —
2509                // the reset's full-table memset would be thrown away. The
2510                // key components mirror `reset_shape` below: Simple leaves
2511                // `resolved_table_bits` 0, never carries an LDM override,
2512                // and `fast_attach` is false in copy mode by construction.
2513                let table_overwritten_by_restore = matches!(dict_hint, Some(size) if size > 0)
2514                    && !dict_attach_epoch
2515                    && self.primed.as_ref().is_some_and(|(_, _, captured)| {
2516                        *captured
2517                            == PrimedKey {
2518                                level,
2519                                params,
2520                                table_bits: 0,
2521                                fast_attach: false,
2522                                ldm: None,
2523                            }
2524                    });
2525                // Cap `hash_log <= window_log + 1` (upstream zstd
2526                // `ZSTD_adjustCParams_internal`): once `window_log` is resized
2527                // down for a small source, a level-default `1 << hash_log`
2528                // table is mostly wasted address space whose per-frame memset
2529                // dominates the compress cost on tiny frames (a 4 KB frame at
2530                // window_log 12 still zero-fills the 64 KiB hash_log-14 table).
2531                // Gated to no-dict frames: the dict-attach path shares one
2532                // hash_log between the main and dict tables (so one hash keys
2533                // both), and shrinking only the main table would break that
2534                // invariant and the small-frame dict ratio.
2535                let hash_log = if dict_hint.is_some_and(|s| s > 0) {
2536                    fast.hash_log
2537                } else {
2538                    fast.hash_log.min(params.window_log as u32 + 1)
2539                };
2540                m.reset(
2541                    params.window_log,
2542                    hash_log,
2543                    fast.mls,
2544                    fast.step_size,
2545                    dict_attach_epoch,
2546                    table_overwritten_by_restore,
2547                );
2548            }
2549            MatcherStorage::Dfast(dfast) => {
2550                dfast.max_window_size = max_window_size;
2551                let dcfg = params
2552                    .dfast
2553                    .expect("Dfast level row must carry a DfastConfig");
2554                // Upstream zstd `cParams.hashLog`/`chainLog`, capped by the
2555                // source-size window when hinted so tiny inputs don't
2556                // over-allocate.
2557                let long_bits = if hinted {
2558                    dfast_hash_bits_for_window(table_window_size).min(dcfg.long_hash_log as usize)
2559                } else {
2560                    dcfg.long_hash_log as usize
2561                };
2562                let short_bits = if hinted {
2563                    dfast_hash_bits_for_window(table_window_size).min(dcfg.short_hash_log as usize)
2564                } else {
2565                    dcfg.short_hash_log as usize
2566                };
2567                resolved_table_bits = long_bits;
2568                dfast.set_hash_bits(long_bits, short_bits);
2569                // Dfast holds no per-block input Vecs (history owns the
2570                // bytes and `add_data` returns each Vec eagerly), so
2571                // `reset` takes no `reuse_space` callback.
2572                dfast.reset();
2573            }
2574            MatcherStorage::Row(row) => {
2575                row.max_window_size = max_window_size;
2576                row.lazy_depth = params.lazy_depth;
2577                let mut row_cfg = params.row.expect("Row level row carries a RowConfig");
2578                if hinted {
2579                    // Clamp the configured hash width by the hinted window
2580                    // (upstream zstd `ZSTD_adjustCParams` caps hashLog by windowLog) —
2581                    // `min`, not replace, so an explicit `hash_log` param
2582                    // override (`row_cfg.hash_bits`) survives the hinted path
2583                    // instead of being overwritten by the window value.
2584                    //
2585                    // Clamp BEFORE `configure` so the backend sees ONE width
2586                    // per frame. Configuring with the unclamped level width
2587                    // and then re-clamping made `row_hash_log` oscillate on
2588                    // every hinted frame, and each width change clears the
2589                    // row tables — `ensure_tables` then re-filled all three
2590                    // every frame in a reused compressor.
2591                    row_cfg.hash_bits = row_cfg
2592                        .hash_bits
2593                        .min(row_hash_bits_for_window(table_window_size));
2594                }
2595                row.configure(row_cfg);
2596                // Key the primed snapshot on the width the backend ACTUALLY
2597                // applied (`set_hash_bits` clamps the request): recording the
2598                // request — or the 0 default on the unhinted path — keys
2599                // identical table geometries apart and forces needless
2600                // dictionary re-primes.
2601                resolved_table_bits = row.hash_bits();
2602                row.reset();
2603            }
2604            MatcherStorage::HashChain(hc) => {
2605                hc.table.max_window_size = max_window_size;
2606                hc.hc.lazy_depth = params.lazy_depth;
2607                let mut hc_cfg = params.hc.expect("HashChain level row carries an HcConfig");
2608                // Cap the hash / chain table logs by the hinted window so a small
2609                // input doesn't allocate the full level's tables (the upstream zstd
2610                // `ZSTD_adjustCParams_internal` clamp: `hashLog <= windowLog + 1`,
2611                // and `cycleLog <= windowLog` — `cycleLog == chainLog` for the HC
2612                // finder, `chainLog - 1` for the BT pair table, so `chainLog <=
2613                // windowLog` (+1 for BT)). Ratio-neutral: a hinted window of
2614                // `2^wlog` bytes holds at most `2^wlog` positions, so the slots
2615                // beyond that are never populated — capping only sheds unused
2616                // allocation. Was the source of L10-lazy peak-alloc ~2.15x the
2617                // upstream zstd on a 1 MiB input. Only applied when hinted; an
2618                // unknown-size stream keeps the full level tables.
2619                // Skip for dict-bearing frames: their `hc_cfg.{hash,chain}_log`
2620                // were already sized to the dictionary content tier via
2621                // `cdict_table_logs` (the dict supplies the long-distance
2622                // matches, so upstream `ZSTD_createCDict` sizes the prepared
2623                // tables to the dict, not the source window). Re-applying the
2624                // source-window cap here would collapse those dict-tier logs
2625                // back to the small hinted source — the same double-cap the
2626                // synthesis sites avoid by using the un-hinted base width.
2627                if hinted && !matches!(dict_hint, Some(size) if size > 0) {
2628                    let wlog = hc_hash_bits_for_window(table_window_size);
2629                    let uses_bt = matches!(
2630                        strategy_tag,
2631                        super::strategy::StrategyTag::Btlazy2
2632                            | super::strategy::StrategyTag::BtOpt
2633                            | super::strategy::StrategyTag::BtUltra
2634                            | super::strategy::StrategyTag::BtUltra2
2635                    );
2636                    hc_cfg.hash_log = hc_cfg.hash_log.min(wlog + 1);
2637                    hc_cfg.chain_log = hc_cfg.chain_log.min(if uses_bt { wlog + 1 } else { wlog });
2638                }
2639                hc.configure(hc_cfg, strategy_tag, params.window_log);
2640                let vec_pool = &mut self.vec_pool;
2641                hc.reset(|mut data| {
2642                    data.resize(data.capacity(), 0);
2643                    vec_pool.push(data);
2644                });
2645                // When the source size is known, pre-size the history mirror to
2646                // the expected total (dictionary + payload) so per-block growth
2647                // does not overshoot via Vec capacity doubling (upstream zstd sizes its
2648                // window buffer exactly). Dominates peak once the match-finder
2649                // tables are dictionary-tier-small. Unhinted streams skip this
2650                // and keep doubling growth.
2651                if let Some(src) = hint {
2652                    // `src` is a u64 hint and may be the u64::MAX "unknown
2653                    // size" sentinel, which truncates under `as usize` on
2654                    // 32-bit targets and overflows when the dict hint is
2655                    // added. Saturate the source size, then saturate the
2656                    // dict-hint addition; `reserve_history` applies the
2657                    // tighter window ceiling to the result.
2658                    let src_hint = usize::try_from(src).unwrap_or(usize::MAX);
2659                    let expected = src_hint.saturating_add(dict_hint.unwrap_or(0));
2660                    hc.table.reserve_history(expected);
2661                }
2662            }
2663        }
2664        // LDM wiring (#27): attach (or clear) the long-distance-match
2665        // producer on the optimal (BT) backend. LDM is the only
2666        // back-reference path that crosses the regular window, so it
2667        // only has a home on the `BtMatcher`; non-BT strategies drop the
2668        // producer. Built AFTER `hc.reset()` because `BtMatcher::reset`
2669        // clears an existing producer's table but does not null the
2670        // slot — installing here gives the new frame a fresh producer.
2671        #[cfg(feature = "hash")]
2672        {
2673            // Resolve the derived LDM params first (immutable borrow of the
2674            // overrides), then reuse the existing producer's allocation below.
2675            let derived_ldm = self
2676                .param_overrides
2677                .as_ref()
2678                .and_then(|ov| ov.ldm)
2679                .map(|ldm_ov| {
2680                    let strategy_ord = ldm_strategy_ordinal(params.strategy_tag, params.lazy_depth);
2681                    // Seed the caller-pinned knobs, then run the upstream zstd
2682                    // derivation over the seed so the remaining (zero)
2683                    // fields are filled with cross-field consistency
2684                    // (e.g. `hash_rate_log = window_log - hash_log`).
2685                    // Clobbering after `adjust_for` would break that and
2686                    // hand the producer an inconsistent set.
2687                    let seed = super::ldm::params::LdmParams {
2688                        window_log: params.window_log as u32,
2689                        hash_log: ldm_ov.hash_log.unwrap_or(0),
2690                        hash_rate_log: ldm_ov.hash_rate_log.unwrap_or(0),
2691                        min_match_length: ldm_ov.min_match.unwrap_or(0),
2692                        bucket_size_log: ldm_ov.bucket_size_log.unwrap_or(0),
2693                    };
2694                    seed.derive(strategy_ord)
2695                });
2696            if let MatcherStorage::HashChain(hc) = &mut self.storage {
2697                // Reuse the existing producer's hash-table allocation when the
2698                // derived params are unchanged: only `clear()` (re-zero the
2699                // table + re-seed the rolling hash, no allocation) is needed for
2700                // the new frame. A params change (or the first frame) forces a
2701                // fresh `LdmProducer::new`. On the reused-encoder compress-dict
2702                // path this avoids re-allocating the LDM hash table (large at
2703                // btultra2) every frame — upstream zstd reuses its `ldmState_t`
2704                // the same way. `clear()` is mandatory here for correctness
2705                // regardless of what `BtMatcher::reset` did to the old table.
2706                let producer = derived_ldm.map(|p| match hc.take_ldm_producer() {
2707                    Some(mut existing) if existing.params() == p => {
2708                        existing.clear();
2709                        existing
2710                    }
2711                    _ => super::ldm::LdmProducer::new(p),
2712                });
2713                hc.set_ldm_producer(producer);
2714            }
2715        }
2716        // Record the resolved matcher shape for the primed-snapshot key. Captured
2717        // here (post-resolution, after the test-only param override) so the key
2718        // reflects exactly the geometry the restored `storage` must match. The
2719        // Fast attach-vs-copy mode is part of the shape ONLY for the Simple
2720        // backend (it decides the distinct dict-table shape that backend builds).
2721        // Dfast/Row/HashChain have their OWN attach/copy regimes, but this bit
2722        // models only the Fast table split; those backends are keyed by the
2723        // resolved matcher geometry instead, so folding the Fast bit into their
2724        // key would over-key identical resolved shapes. When it applies it
2725        // matches the decision `prime_with_dictionary` makes from the same
2726        // `reset_size_log`.
2727        let fast_attach = matches!(next_backend, super::strategy::BackendTag::Simple)
2728            && self.reset_dict_attach_ok
2729            && self
2730                .reset_size_log
2731                .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2732        // The LDM override is part of the snapshot identity ONLY on the
2733        // optimal (BinaryTree) path: that is the only backend whose cloned
2734        // `storage` carries a `BtMatcher::ldm_producer`. On Fast / Dfast /
2735        // Row and lazy-HashChain resets the producer slot does not exist,
2736        // so folding the override there would over-key the snapshot and
2737        // force needless re-primes when LDM is toggled. Gated like
2738        // `fast_attach` (a key bit only participates where it changes the
2739        // cloned matcher shape).
2740        let active_ldm = if matches!(params.search, super::strategy::SearchMethod::BinaryTree) {
2741            self.param_overrides.and_then(|ov| ov.ldm)
2742        } else {
2743            None
2744        };
2745        self.reset_shape = Some((params, resolved_table_bits, fast_attach, active_ldm));
2746    }
2747
2748    fn dictionary_is_resident(&self) -> bool {
2749        match &self.storage {
2750            MatcherStorage::HashChain(hc) => hc.table.dict_resident,
2751            MatcherStorage::Simple(s) => s.dict_resident(),
2752            MatcherStorage::Dfast(d) => d.dict_resident(),
2753            _ => false,
2754        }
2755    }
2756
2757    fn reapply_resident_dictionary(&mut self, offset_hist: [u32; 3]) {
2758        // Same offset-history head as `prime_with_dictionary`, without the dict
2759        // commit / re-index (resident dict bytes + cached dms already in place).
2760        match self.active_backend() {
2761            super::strategy::BackendTag::Simple => {
2762                self.simple_mut().prime_offset_history(offset_hist)
2763            }
2764            super::strategy::BackendTag::Dfast => {
2765                self.dfast_matcher_mut().offset_hist = offset_hist
2766            }
2767            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2768            super::strategy::BackendTag::HashChain => {
2769                let matcher = self.hc_matcher_mut();
2770                matcher.table.offset_hist = offset_hist;
2771                matcher.table.mark_dictionary_primed();
2772            }
2773        }
2774        // Restore the retained-dictionary budget the per-frame `reset` cleared.
2775        // The matcher's `reset` re-inflated `max_window_size` by the resident
2776        // dict region (so the dict + next input both stay in the eviction band),
2777        // exactly as `prime_with_dictionary` does — but the resident path skips
2778        // that prime, so without this the driver-level budget stays 0 and
2779        // `retire_dictionary_budget` never shrinks the inflated window as input
2780        // evicts the dict. For HashChain (whose `window_low` is measured against
2781        // `max_window_size`), a stuck-inflated window would let a post-eviction
2782        // match exceed the frame header's base window and emit an over-window
2783        // offset. The inflation equals `max_window_size - base`, and
2784        // `reported_window_size` is the base `1 << window_log` set by `reset`.
2785        let base = self.reported_window_size;
2786        let inflated = match self.active_backend() {
2787            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2788            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2789            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2790            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2791        };
2792        self.dictionary_retained_budget = inflated.saturating_sub(base);
2793    }
2794
2795    fn prime_with_dictionary(&mut self, dict_content: &[u8], offset_hist: [u32; 3]) {
2796        match self.active_backend() {
2797            super::strategy::BackendTag::Simple => {
2798                // Routes through prime_offset_history so BOTH
2799                // offset_hist (wire encoder) and rep[0..2] (kernel)
2800                // are updated atomically. Without this, the two
2801                // tracks drift after dict priming — kernel emits
2802                // repcode matches against stale FAST_INITIAL_REP
2803                // while the wire encoder uses the primed history,
2804                // producing divergent wire encoding (Copilot review
2805                // #15 on #216).
2806                self.simple_mut().prime_offset_history(offset_hist);
2807            }
2808            super::strategy::BackendTag::Dfast => {
2809                self.dfast_matcher_mut().offset_hist = offset_hist
2810            }
2811            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2812            super::strategy::BackendTag::HashChain => {
2813                let matcher = self.hc_matcher_mut();
2814                // Clear the chain/hash tables (deferred from the dict-active
2815                // `reset`): prime rebuilds them from the dict, so they must start
2816                // empty. The reuse hot path skips prime and `clone_from`s a clean
2817                // snapshot instead, so only the first-prime / key-mismatch frames
2818                // pay the fill -- not every reused-CDict frame.
2819                matcher.table.clear_chain_hash_tables();
2820                matcher.table.offset_hist = offset_hist;
2821                matcher.table.mark_dictionary_primed();
2822            }
2823        }
2824
2825        if dict_content.is_empty() {
2826            return;
2827        }
2828
2829        // Dictionary bytes should stay addressable until produced frame output
2830        // itself exceeds the live window size. We bump `max_window_size`
2831        // by the dictionary length so the eviction band keeps the
2832        // primed bytes in `history`.
2833        //
2834        // Cap: `with_params`/`reset` enforce `window_log <= 30` so the
2835        // eviction band `2 * max_window_size` stays below `u32::MAX`
2836        // with headroom for one MAX_BLOCK_SIZE pending block — the
2837        // kernel asserts `data.len() <= u32::MAX`. A large enough
2838        // dictionary could otherwise push `max_window_size` past
2839        // that ceiling via the `saturating_add` below and silently
2840        // re-introduce the same overflow the `window_log` cap was
2841        // designed to prevent. Clamp the post-priming size so the
2842        // doubled-band-plus-block invariant survives.
2843        use super::match_table::storage::MAX_PRIMED_WINDOW_SIZE;
2844
2845        // `requested_dict_budget` is what the caller asked for;
2846        // `base_max_window_size` snapshots the pre-priming cap so we
2847        // can compute how much window the cap actually GRANTED below.
2848        // The cap may clip the requested growth, in which case the
2849        // bookkeeping (`dictionary_retained_budget` retire path) must
2850        // track only the granted portion — otherwise
2851        // `retire_dictionary_budget()` would later reclaim more than
2852        // was actually added and shrink the matcher below its real
2853        // base window (and `cap = 2 * max_window_size` would shrink
2854        // with it, risking under-allocation on subsequent commits).
2855        // The `granted_retained_budget` calculation further below is
2856        // the load-bearing piece — see its block-level comment for
2857        // the post-clip / post-uncommitted-tail math.
2858        let requested_dict_budget = dict_content.len();
2859        let base_max_window_size = match self.active_backend() {
2860            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2861            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2862            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2863            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2864        };
2865        match self.active_backend() {
2866            super::strategy::BackendTag::Simple => {
2867                let matcher = self.simple_mut();
2868                matcher.max_window_size = matcher
2869                    .max_window_size
2870                    .saturating_add(requested_dict_budget)
2871                    .min(MAX_PRIMED_WINDOW_SIZE);
2872            }
2873            super::strategy::BackendTag::Dfast => {
2874                let matcher = self.dfast_matcher_mut();
2875                matcher.max_window_size = matcher
2876                    .max_window_size
2877                    .saturating_add(requested_dict_budget)
2878                    .min(MAX_PRIMED_WINDOW_SIZE);
2879            }
2880            super::strategy::BackendTag::Row => {
2881                let matcher = self.row_matcher_mut();
2882                matcher.max_window_size = matcher
2883                    .max_window_size
2884                    .saturating_add(requested_dict_budget)
2885                    .min(MAX_PRIMED_WINDOW_SIZE);
2886            }
2887            super::strategy::BackendTag::HashChain => {
2888                let matcher = self.hc_matcher_mut();
2889                matcher.table.max_window_size = matcher
2890                    .table
2891                    .max_window_size
2892                    .saturating_add(requested_dict_budget)
2893                    .min(MAX_PRIMED_WINDOW_SIZE);
2894            }
2895        }
2896
2897        let mut start = 0usize;
2898        let mut committed_dict_budget = 0usize;
2899        // insert_position needs 4 bytes of lookahead for hashing;
2900        // backfill_boundary_positions re-visits tail positions once the
2901        // next slice extends history, but cannot hash <4 byte fragments.
2902        let min_primed_tail = match self.active_backend() {
2903            super::strategy::BackendTag::Simple => MIN_MATCH_LEN,
2904            super::strategy::BackendTag::Dfast
2905            | super::strategy::BackendTag::Row
2906            | super::strategy::BackendTag::HashChain => 4,
2907        };
2908        while start < dict_content.len() {
2909            let end = (start + self.slice_size).min(dict_content.len());
2910            if end - start < min_primed_tail {
2911                break;
2912            }
2913            // Stage the dict chunk WITHOUT `get_next_space`'s
2914            // `resize(slice_size, 0)` zero-fill: that memsets a full
2915            // block-sized buffer (up to ~128 KiB) every frame only to have it
2916            // `clear()`-ed and overwritten by the dict bytes on the very next
2917            // lines — pure waste (measured ~10% of the small dict encode).
2918            // Reuse a pooled buffer's capacity if one is free (the prime/skip
2919            // cycle recycles them back), else allocate exactly the chunk.
2920            // Mirrors upstream zstd, which references the CDict content rather
2921            // than zero-filling a fresh window per frame.
2922            let mut space = self.vec_pool.pop().unwrap_or_default();
2923            space.clear();
2924            space.extend_from_slice(&dict_content[start..end]);
2925            self.commit_space(space);
2926            self.skip_matching_for_dictionary_priming();
2927            committed_dict_budget += end - start;
2928            start = end;
2929        }
2930
2931        // Derive `granted_retained_budget` directly from the two real
2932        // bounds — bytes actually committed and bytes the cap allows
2933        // — instead of doing a cap-clip pass followed by an
2934        // uncommitted-tail subtract. Previous shape double-discounted
2935        // when the cap clipped: clip lost `(requested - allowed)`,
2936        // then tail-subtract lost ANOTHER `(requested - committed)`,
2937        // leaving `max_window_size` shy of the dictionary that was
2938        // actually retained (e.g. cap=900, committed=998, uncommitted=2
2939        // landed at granted=898 instead of the correct 900).
2940        let capped_retained_budget = MAX_PRIMED_WINDOW_SIZE.saturating_sub(base_max_window_size);
2941        let granted_retained_budget = committed_dict_budget.min(capped_retained_budget);
2942        let final_max_window_size = base_max_window_size.saturating_add(granted_retained_budget);
2943        match self.active_backend() {
2944            super::strategy::BackendTag::Simple => {
2945                self.simple_mut().max_window_size = final_max_window_size;
2946            }
2947            super::strategy::BackendTag::Dfast => {
2948                self.dfast_matcher_mut().max_window_size = final_max_window_size;
2949            }
2950            super::strategy::BackendTag::Row => {
2951                self.row_matcher_mut().max_window_size = final_max_window_size;
2952            }
2953            super::strategy::BackendTag::HashChain => {
2954                self.hc_matcher_mut().table.max_window_size = final_max_window_size;
2955            }
2956        }
2957        if granted_retained_budget > 0 {
2958            self.dictionary_retained_budget = self
2959                .dictionary_retained_budget
2960                .saturating_add(granted_retained_budget);
2961        }
2962        if self.active_backend() == super::strategy::BackendTag::HashChain {
2963            // Recompute the lazy-HC attach decision made per-chunk in
2964            // `skip_matching_for_dictionary_priming` (stable across the prime —
2965            // `reset_size_log` does not change here).
2966            //
2967            // The HC attach/copy mode is deliberately NOT folded into `PrimedKey`
2968            // (unlike Fast `fast_attach`). Fast attach builds a separate dict
2969            // table whose dimensions differ from the copy-mode live table, so a
2970            // cross-mode restore would install mismatched table geometry and the
2971            // encoder could search past the frame window (undecodable). The two
2972            // HC modes share identical window geometry: `max_window_size` and the
2973            // dictionary limit are both set ABOVE this branch (the same value in
2974            // either mode), and the live chain table dimensions come from the
2975            // resolved `params` the key already pins. The modes differ only in
2976            // WHERE the committed dict lives — a single-link `dms` (attach) vs
2977            // merged into the live chain (copy) — both producing valid matches at
2978            // in-window offsets. Upstream zstd makes the same observation: attach
2979            // (`ZSTD_resetCCtx_byAttachingCDict`) and copy
2980            // (`ZSTD_resetCCtx_byCopyingCDict`) both keep the caller's
2981            // `windowLog`; the choice is a memory/speed trade-off, not a wire
2982            // contract. So restoring an attach snapshot where this frame would
2983            // have copied (or vice versa) yields a decodable frame that may only
2984            // differ in which matches are found (ratio) — algorithmic freedom, not
2985            // a defect. Keying on the mode would instead force a re-prime across
2986            // the cutoff, re-adding the per-frame cost this snapshot path removes.
2987            //
2988            // In practice the public reuse path (`compress_independent_frame`)
2989            // only ever captures AND restores the COPY-mode snapshot — capture is
2990            // gated on the above-cutoff source size, so a restored frame always
2991            // matches the captured mode. `hc_dict_snapshot_reuse_roundtrips` pins
2992            // that same-mode reuse decodes; the driver-level cross-mode restore is
2993            // accepted (not refused) per
2994            // `primed_snapshot_fast_attach_does_not_over_key_non_simple_backends`.
2995            let attach = self.hc_dict_attach_mode();
2996            let table = &mut self.hc_matcher_mut().table;
2997            table.set_dictionary_limit_from_primed_bytes(committed_dict_budget);
2998            // Build the dictMatchState over the committed dict (front of history)
2999            // so `find_best_match` dual-probes it with its own compare budget —
3000            // but ONLY in ATTACH mode. BT/optimal attach → DUBT dms; lazy-HC
3001            // attach → single-link hash-chain dms. COPY mode (large known source,
3002            // both BT and lazy-HC) already merged the dict into the live tree /
3003            // chain in `skip_matching_for_dictionary_priming`, so it carries no
3004            // separate dms — drop any stale one.
3005            if !attach {
3006                table.dms.invalidate();
3007            } else if table.uses_bt {
3008                table.prime_dms_bt(committed_dict_budget);
3009            } else {
3010                table.prime_dms_hc(committed_dict_budget);
3011            }
3012        }
3013        // CDict-equivalent: now that every dict chunk is indexed, mark the
3014        // Fast-backend dict table primed so the next frame's re-prime reuses
3015        // it (skips the re-hash) while still re-committing the dict bytes to
3016        // history. No-op when the attach path built no table (copy mode or a
3017        // sub-8-byte dict) — `mark_dict_primed` self-guards on table presence.
3018        match self.active_backend() {
3019            super::strategy::BackendTag::Simple => self.simple_mut().mark_dict_primed(),
3020            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().mark_dict_primed(),
3021            super::strategy::BackendTag::Row => self.row_matcher_mut().mark_dict_primed(),
3022            _ => {}
3023        }
3024    }
3025
3026    fn restore_primed_dictionary(&mut self, level: super::CompressionLevel) -> bool {
3027        // Only the (storage, dictionary_retained_budget) pair is what
3028        // `prime_with_dictionary` writes; restoring them reproduces the
3029        // post-prime state exactly. Gated on the FULL resolved key (level + the
3030        // resolved `LevelParams` + the active backend's table width), not just
3031        // the level: `reset` resolves the hint into a window/table geometry, so a
3032        // same-level snapshot taken at a hint that resolved to a different shape
3033        // carries a `storage.max_window_size` / table dimensions that no longer
3034        // match this reset. Restoring it would let the encoder search past the
3035        // frame header's window (an undecodable match), so on a key mismatch we
3036        // refuse and the caller re-primes.
3037        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
3038            return false;
3039        };
3040        let key = PrimedKey {
3041            level,
3042            params,
3043            table_bits,
3044            fast_attach,
3045            ldm,
3046        };
3047        let Some((snapshot, budget, captured_key)) = &self.primed else {
3048            return false;
3049        };
3050        if *captured_key != key {
3051            return false;
3052        }
3053        let budget = *budget;
3054        match (&mut self.storage, snapshot) {
3055            // Same-variant Fast restore: copy the snapshot into the retained
3056            // live storage. `clone_from` reuses the history / hash-table /
3057            // dict-table buffers, so this is the upstream zstd CDict table-copy
3058            // regime's cost (pure copies) instead of a full per-frame
3059            // allocation + copy + drop cycle.
3060            (MatcherStorage::Simple(live), MatcherStorage::Simple(snap)) => {
3061                live.clone_from(snap);
3062            }
3063            // Same-variant HC lazy/greedy restore (non-BT): the snapshot keeps
3064            // the full primed hash/chain tables (capture's non-BT full clone),
3065            // so `clone_from` reuses the live history/hash/chain/dms buffers in
3066            // place — upstream zstd reuses the CDict tables rather than reallocating
3067            // them. This is the per-frame allocate+copy+drop that dominated
3068            // small `compress-dict` HC frames (5-7x vs C). BT (`uses_bt`)
3069            // snapshots drop their live tables, so they stay on the realloc
3070            // path below.
3071            (MatcherStorage::HashChain(live), MatcherStorage::HashChain(snap))
3072                if !snap.table.uses_bt =>
3073            {
3074                live.table.clone_from(&snap.table);
3075                live.hc.clone_from(&snap.hc);
3076                live.strategy_tag = snap.strategy_tag;
3077                // backend is `HcBackend::Hc` (zero-sized) for non-BT levels;
3078                // the live one is already correct for this resolved key.
3079            }
3080            (live, snapshot_storage) => {
3081                let mut storage = snapshot_storage.clone();
3082                // This arm handles the binary-tree backend. In ATTACH mode the
3083                // snapshot was stored WITHOUT its live hash / chain / hash3
3084                // tables (they hold no dictionary entries — the dict lives in
3085                // `dms` + history; see `capture_primed_dictionary`), so
3086                // `ensure_tables` re-allocates them zeroed to the snapshot's
3087                // geometry, exactly reproducing the post-prime state (all
3088                // `HC_EMPTY`). In COPY mode the snapshot retained its FULL live
3089                // tree (the dict was merged into it, no `dms`), so the tables are
3090                // already present at the right length and `ensure_tables` — which
3091                // only allocates on a length mismatch — leaves them untouched.
3092                // Either way this is a full storage replace, so no stale
3093                // live-table entry from a prior frame can survive.
3094                if let MatcherStorage::HashChain(hc) = &mut storage {
3095                    hc.table.ensure_tables();
3096                }
3097                // The snapshot does not retain the LDM producer (it holds no
3098                // dict state; see `capture_primed_dictionary`). Carry over the
3099                // frame's freshly-reset producer — built this frame by `reset`
3100                // with the same params the snapshot key pins, and empty (no
3101                // input processed yet), so it is equivalent to the producer
3102                // the snapshot was captured with.
3103                #[cfg(feature = "hash")]
3104                {
3105                    let fresh_ldm = if let MatcherStorage::HashChain(hc) = live {
3106                        hc.take_ldm_producer()
3107                    } else {
3108                        None
3109                    };
3110                    if let MatcherStorage::HashChain(hc) = &mut storage {
3111                        hc.set_ldm_producer(fresh_ldm);
3112                    }
3113                }
3114                *live = storage;
3115            }
3116        }
3117        self.dictionary_retained_budget = budget;
3118        true
3119    }
3120
3121    fn capture_primed_dictionary(&mut self, level: super::CompressionLevel) {
3122        // No resolved shape means `reset` has not run for this frame — nothing
3123        // valid to key a snapshot on, so skip the capture.
3124        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
3125            return;
3126        };
3127        let key = PrimedKey {
3128            level,
3129            params,
3130            table_bits,
3131            fast_attach,
3132            ldm,
3133        };
3134        // CDict-equivalent retained state. A binary-tree level in ATTACH mode
3135        // decouples the dictionary into `dms` (the upstream zstd `dictMatchState`); its
3136        // live hash / chain / hash3 tables carry NO dict entries
3137        // (`skip_matching_dict_bt` keeps the dict out of the live tree), so they
3138        // are pure zeros. Storing them in the snapshot wastes the full table
3139        // footprint (a second window-tier table set resident for the whole
3140        // compress). Instead, move the live tables OUT of the working storage,
3141        // clone only the dict-state (history + `dms` + window/offset/dict-limit),
3142        // then move the live tables back — the snapshot keeps just what upstream zstd's
3143        // CDict keeps, and `restore_primed_dictionary` re-allocates the zeroed
3144        // live tables. Every other case keeps the dict reachable through the live
3145        // structure, so the snapshot must retain the full tables (full clone):
3146        // lazy-HC attach (it DOES prime a hash-chain `dms`, but the live chain is
3147        // still the search structure, so the tables must travel) and COPY mode for
3148        // BOTH BT and lazy-HC (`dms` invalidated, dict merged into the live tree /
3149        // chain). `uses_bt && dms.is_primed()` is therefore the exact "decoupled"
3150        // signal — true only for the BT attach prime; lazy-HC attach primes `dms`
3151        // too but is intentionally NOT decoupled.
3152        let bt_decoupled = matches!(
3153            &self.storage,
3154            MatcherStorage::HashChain(hc) if hc.table.uses_bt && hc.table.dms.is_primed()
3155        );
3156        if bt_decoupled {
3157            let MatcherStorage::HashChain(hc) = &mut self.storage else {
3158                unreachable!("bt_decoupled implies HashChain storage");
3159            };
3160            let hash_table = core::mem::take(&mut hc.table.hash_table);
3161            let chain_table = core::mem::take(&mut hc.table.chain_table);
3162            let hash3_table = core::mem::take(&mut hc.table.hash3_table);
3163            // The LDM producer carries no dictionary state (LDM is not
3164            // dict-primed; its hash table is empty at capture), so it is not
3165            // retained either — `restore` reinstates the frame's freshly-reset
3166            // producer. Take it out so the clone does not duplicate its table.
3167            #[cfg(feature = "hash")]
3168            let ldm_producer = hc.take_ldm_producer();
3169            // Clone the dict-state-only storage (live tables now empty Vecs,
3170            // LDM producer detached).
3171            let snapshot = self.storage.clone();
3172            // Move the live tables (and LDM producer) back into the working storage.
3173            let MatcherStorage::HashChain(hc) = &mut self.storage else {
3174                unreachable!("storage variant is stable across the take/put");
3175            };
3176            hc.table.hash_table = hash_table;
3177            hc.table.chain_table = chain_table;
3178            hc.table.hash3_table = hash3_table;
3179            #[cfg(feature = "hash")]
3180            hc.set_ldm_producer(ldm_producer);
3181            self.primed = Some((snapshot, self.dictionary_retained_budget, key));
3182        } else {
3183            self.primed = Some((self.storage.clone(), self.dictionary_retained_budget, key));
3184        }
3185    }
3186
3187    fn invalidate_primed_dictionary(&mut self) {
3188        self.primed = None;
3189        // Drop the Fast-backend CDict-equivalent table cache too: it is keyed
3190        // to the dictionary being removed / replaced. Left in place, the next
3191        // same-params `reset` would retain it and the kernel would probe a
3192        // dict region whose bytes are no longer re-committed to history.
3193        match self.active_backend() {
3194            super::strategy::BackendTag::Simple => self.simple_mut().invalidate_dict_cache(),
3195            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().invalidate_dict_cache(),
3196            // Row keeps its attach index across frames (like Simple/Dfast),
3197            // so a dictionary swap must drop its cached dict rows too;
3198            // otherwise the next small/unknown-size frame reuses stale
3199            // attach state through `prime_dict_attach_current_block`.
3200            super::strategy::BackendTag::Row => self.row_matcher_mut().invalidate_dict_cache(),
3201            // The BT dms tree is keyed to the dict bytes; `prime_dms_bt`
3202            // skips the rebuild while its shape matches, so a swapped
3203            // dictionary of the same length would otherwise keep serving the
3204            // OLD dictionary's tree.
3205            super::strategy::BackendTag::HashChain => {
3206                let table = &mut self.hc_matcher_mut().table;
3207                table.dms.invalidate();
3208                // Deactivate the dictionary state so the next `reset` does not
3209                // take the dict-active defer-the-table-clear branch. That branch
3210                // rewinds the tables to the origin and hands the clear off to a
3211                // following `prime_with_dictionary` / `restore_primed_dictionary`.
3212                // After a dictionary is removed (or replaced), the very next
3213                // frame may carry no dictionary, in which case neither hand-off
3214                // runs and the deferred clear would never execute — leaving stale
3215                // dict-region entries at the rewound base. Clearing the flag
3216                // routes that reset down the no-dictionary path instead; a
3217                // replacement dictionary re-arms the flag when it re-primes.
3218                table.dictionary_active = false;
3219            }
3220        }
3221    }
3222
3223    fn seed_dictionary_entropy(
3224        &mut self,
3225        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
3226        ll: Option<&crate::fse::fse_encoder::FSETable>,
3227        ml: Option<&crate::fse::fse_encoder::FSETable>,
3228        of: Option<&crate::fse::fse_encoder::FSETable>,
3229    ) {
3230        if self.active_backend() == super::strategy::BackendTag::HashChain {
3231            self.hc_matcher_mut()
3232                .seed_dictionary_entropy(huff, ll, ml, of);
3233        }
3234    }
3235
3236    fn window_size(&self) -> u64 {
3237        self.reported_window_size as u64
3238    }
3239
3240    fn get_next_space(&mut self) -> Vec<u8> {
3241        if let Some(mut space) = self.vec_pool.pop() {
3242            if space.len() > self.slice_size {
3243                space.truncate(self.slice_size);
3244            }
3245            if space.len() < self.slice_size {
3246                space.resize(self.slice_size, 0);
3247            }
3248            return space;
3249        }
3250        alloc::vec![0; self.slice_size]
3251    }
3252
3253    fn get_last_space(&mut self) -> &[u8] {
3254        match &self.storage {
3255            MatcherStorage::Simple(m) => m.last_committed_space(),
3256            MatcherStorage::Dfast(m) => m.get_last_space(),
3257            MatcherStorage::Row(m) => m.get_last_space(),
3258            MatcherStorage::HashChain(m) => m.table.get_last_space(),
3259        }
3260    }
3261
3262    fn commit_space(&mut self, space: Vec<u8>) {
3263        let mut evicted_bytes = 0usize;
3264        // Split borrows manually so the `add_data` closures can write
3265        // into `vec_pool` while the backend itself holds an exclusive
3266        // borrow via `storage`. (Suffix-store recycling went away
3267        // with the legacy `MatchGenerator`; the FastKernelMatcher
3268        // arm below has no pool interaction.)
3269        let vec_pool = &mut self.vec_pool;
3270        match &mut self.storage {
3271            MatcherStorage::Simple(m) => {
3272                // FastKernelMatcher owns its history as a single
3273                // flat Vec<u8> and the hash table as a Vec<u32> —
3274                // neither recycles into the driver-side pools. The
3275                // eager pre-commit eviction inside
3276                // `FastKernelMatcher::accept_data` drops bytes when
3277                // accepting this block would push history past 2×
3278                // max_window_size; that delta is what feeds
3279                // `evicted_bytes` here via the `pre / post`
3280                // history-length comparison.
3281                let pre = m.history_len_for_eviction_accounting();
3282                m.accept_data(space);
3283                let post = m.history_len_for_eviction_accounting();
3284                // `accept_data` performs eager pre-commit window
3285                // eviction (so this `pre - post` delta correctly
3286                // feeds the dictionary-budget retire flow). See
3287                // `FastKernelMatcher::accept_data` for the
3288                // commit-time-visibility rationale (closes #216
3289                // CodeRabbit review #5 / Copilot review #1: without
3290                // eager eviction, the delta was always 0 and the
3291                // dict budget never retired, leaving max_window_size
3292                // inflated post-dict-prime → matcher could emit
3293                // offsets exceeding the frame header's window).
3294                evicted_bytes += pre.saturating_sub(post);
3295            }
3296            MatcherStorage::Dfast(m) => {
3297                // Dfast's `add_data` callback receives the INPUT
3298                // `Vec<u8>` for pool recycling (Dfast stores its
3299                // bytes in the contiguous `history` buffer, not in
3300                // per-block Vecs — there is no per-block buffer to
3301                // pop off and hand back). Counting `data.len()` as
3302                // evicted bytes would conflate "new bytes ingested"
3303                // with "old bytes evicted from window"; the two
3304                // happen to coincide when the previous window was
3305                // saturated and the new input fills it 1:1, but
3306                // diverge when the eviction pop-loop drops blocks
3307                // of a different size than the incoming input. The
3308                // `dictionary_retained_budget` retire decision
3309                // downstream then gets driven by inflated eviction
3310                // counts and shrinks `max_window_size` prematurely.
3311                //
3312                // Derive the real eviction delta from `window_size`
3313                // before/after the call. The pop loop inside
3314                // `add_data` decrements `window_size` by each
3315                // evicted block length and then the final
3316                // `extend_from_slice + push_back` adds `space_len`,
3317                // so `evicted = pre + space_len - post`.
3318                let pre = m.window_size;
3319                let space_len = space.len();
3320                m.add_data(space, |data| {
3321                    // Same per-block recycle as the HashChain arm: push
3322                    // the spent input buffer back as-is rather than
3323                    // zero-filling to capacity. `add_data` mirrors the
3324                    // bytes into `history` and calls this every block, so
3325                    // capacity-wide zeroing would be hot-path waste;
3326                    // `get_next_space` zeroes at most `slice_size` bytes
3327                    // when it later reuses the buffer.
3328                    vec_pool.push(data);
3329                });
3330                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3331                // block are byte counts bounded by the window, no overflow.
3332                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3333            }
3334            MatcherStorage::Row(m) => {
3335                // RowMatchGenerator::add_data recycles the *input* buffer
3336                // through this callback every commit (its bytes are mirrored
3337                // into `history`), not the evicted chunks. Derive the eviction
3338                // delta from `window_size` before/after — `evicted = pre +
3339                // space_len - post` — exactly like the Simple / HashChain arms.
3340                // Counting the callback argument as evicted would charge the
3341                // whole committed block as evicted and prematurely retire
3342                // dictionary budget on a window that evicts nothing.
3343                let pre = m.window_size;
3344                let space_len = space.len();
3345                m.add_data(space, |data| {
3346                    // Recycle the spent buffer as-is; `add_data` runs this for
3347                    // every committed block, so zero-filling to capacity here
3348                    // would be hot-path waste (`get_next_space` zeroes at most
3349                    // `slice_size` on reuse).
3350                    vec_pool.push(data);
3351                });
3352                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3353                // block are byte counts bounded by the window, no overflow.
3354                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3355            }
3356            MatcherStorage::HashChain(m) => {
3357                // MatchTable::add_data now recycles the *incoming* buffer
3358                // through `reuse_space` (its bytes are copied into the
3359                // contiguous `history` mirror), so the callback no longer
3360                // reports evicted chunks. Derive the eviction delta from
3361                // `window_size` before/after, exactly like the Simple arm:
3362                // `evicted = pre + space_len - post`.
3363                let pre = m.table.window_size;
3364                let space_len = space.len();
3365                m.table.add_data(space, |data| {
3366                    // Recycle the spent input buffer to the pool as-is.
3367                    // `add_data` runs this callback for every committed
3368                    // block (the bytes are mirrored into `history`), so
3369                    // growing the buffer to its full capacity here would
3370                    // zero the whole allocation on the hot path.
3371                    // `get_next_space` resizes a popped buffer to
3372                    // `slice_size` on demand, touching at most
3373                    // `slice_size` bytes — never the larger capacity the
3374                    // pool retains.
3375                    vec_pool.push(data);
3376                });
3377                // Plain `+` (the `saturating_sub` floors at 0): byte counts
3378                // bounded by the window, no overflow.
3379                evicted_bytes += (pre + space_len).saturating_sub(m.table.window_size);
3380            }
3381        }
3382        // Gate the second backend trim pass on actual budget
3383        // reclamation. Without it, every slice commit on the
3384        // no-dictionary / no-eviction path (the common case) would
3385        // run a backend `match` ladder + `trim_to_window` early-out
3386        // for no reason — `trim_after_budget_retire` only does
3387        // meaningful work when `retire_dictionary_budget` shrank
3388        // `max_window_size` enough to make the backend's
3389        // `window_size > max_window_size` invariant trigger
3390        // eviction.
3391        if self.retire_dictionary_budget(evicted_bytes) {
3392            self.trim_after_budget_retire();
3393        }
3394    }
3395
3396    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
3397        use super::strategy::{self, StrategyTag};
3398        // Borrowed one-shot Fast path: if the frame driver staged a
3399        // block range via `set_borrowed_block`, scan it in place against
3400        // the borrowed window instead of the owned committed block. Only
3401        // the Simple backend is instrumented (the gate guarantees it),
3402        // and the stage is consumed so the next block re-stages.
3403        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3404            match self.active_backend() {
3405                super::strategy::BackendTag::Simple => {
3406                    let m = self.simple_mut();
3407                    if m.dict_is_attached() {
3408                        // Dict-attach borrowed scan: live matches read the
3409                        // borrowed input in place, dict matches read the
3410                        // committed dict prefix via the 2-segment counter.
3411                        m.start_matching_borrowed_dict(
3412                            block_start,
3413                            block_end,
3414                            &mut handle_sequence,
3415                        );
3416                    } else {
3417                        m.start_matching_borrowed(block_start, block_end, &mut handle_sequence);
3418                    }
3419                }
3420                super::strategy::BackendTag::Dfast => self
3421                    .dfast_matcher_mut()
3422                    .start_matching_borrowed(block_start, block_end, &mut handle_sequence),
3423                super::strategy::BackendTag::Row => {
3424                    // Same greedy/lazy parse split as the owned RowHash arm.
3425                    let greedy = self.parse == super::strategy::ParseMode::Greedy;
3426                    self.row_matcher_mut().start_matching_borrowed(
3427                        block_start,
3428                        block_end,
3429                        greedy,
3430                        &mut handle_sequence,
3431                    );
3432                }
3433                super::strategy::BackendTag::HashChain => match self.search {
3434                    super::strategy::SearchMethod::HashChain => self
3435                        .hc_matcher_mut()
3436                        .start_matching_lazy_borrowed(block_start, block_end, &mut handle_sequence),
3437                    super::strategy::SearchMethod::BinaryTree => {
3438                        // Run the SAME BT dispatch as the owned BinaryTree arm
3439                        // below — every BT body reads its range via
3440                        // current_block_range() and bytes via live_history()
3441                        // (borrowed-aware), so the staged block is scanned in
3442                        // place. The table was already staged by
3443                        // `set_borrowed_block` (the HashChain arm at the top of
3444                        // this file calls `table.stage_borrowed_block` with the
3445                        // same range, and `borrowed_pending` is set only there),
3446                        // so no re-stage is needed here.
3447                        // Only btlazy2 reaches the borrowed BinaryTree scan:
3448                        // `borrowed_supported()` keeps the optimal parsers
3449                        // (BtOpt/BtUltra/BtUltra2) on the owned path, and
3450                        // `set_borrowed_block` asserts that predicate before any
3451                        // range is staged, so an optimal strategy_tag can never
3452                        // arrive here.
3453                        match self.strategy_tag {
3454                            StrategyTag::Btlazy2 => self
3455                                .hc_matcher_mut()
3456                                .start_matching_btlazy2(&mut handle_sequence),
3457                            other => unreachable!(
3458                                "borrowed BinaryTree scan is only supported for Btlazy2, got {other:?}"
3459                            ),
3460                        }
3461                    }
3462                    other => {
3463                        unreachable!("HashChain backend with unexpected search {other:?}")
3464                    }
3465                },
3466            }
3467            return;
3468        }
3469        // Decoupled parse×search dispatch (fires once per block). The
3470        // search axis (`self.search`) picks the candidate-finding backend;
3471        // the parse axis (greedy vs lazy depth) is carried by the
3472        // backend's runtime `lazy_depth`, set per level at `reset()`.
3473        // The two are independent, so any parse can run on any search
3474        // backend. The `BinaryTree` arm still selects the opt `Strategy`
3475        // ZST off `strategy_tag` so `compress_block::<S>` keeps its
3476        // const-folded optimal-parser monomorphisation.
3477        use super::strategy::SearchMethod;
3478        match self.search {
3479            SearchMethod::Fast => {
3480                self.simple_mut().start_matching(&mut handle_sequence);
3481                self.recycle_simple_space();
3482            }
3483            SearchMethod::DoubleFast => {
3484                self.dfast_matcher_mut()
3485                    .start_matching(&mut handle_sequence);
3486            }
3487            SearchMethod::RowHash => {
3488                // Greedy parse (depth 0) = upstream zstd-greedy entry (default
3489                // `ip + 1` start, greedy repcode commit); lazy / lazy2 use
3490                // the `pick_lazy_match` lookahead entry (reads `lazy_depth`).
3491                // Both bare entries dispatch on `row_log` internally into the
3492                // const-`ROW_LOG` hot loop (upstream zstd per-rowLog variant table).
3493                let greedy = self.parse == super::strategy::ParseMode::Greedy;
3494                let row = self.row_matcher_mut();
3495                if greedy {
3496                    row.start_matching_greedy(&mut handle_sequence);
3497                } else {
3498                    row.start_matching(&mut handle_sequence);
3499                }
3500            }
3501            SearchMethod::HashChain => {
3502                // Greedy/lazy/lazy2 all flow through the lazy parser; it
3503                // reads `hc.lazy_depth` (0 = greedy commit).
3504                self.hc_matcher_mut()
3505                    .start_matching_lazy(&mut handle_sequence);
3506            }
3507            SearchMethod::BinaryTree => match self.strategy_tag {
3508                StrategyTag::Btlazy2 => self
3509                    .hc_matcher_mut()
3510                    .start_matching_btlazy2(&mut handle_sequence),
3511                StrategyTag::BtOpt => self.compress_block::<strategy::BtOpt>(&mut handle_sequence),
3512                StrategyTag::BtUltra => {
3513                    self.compress_block::<strategy::BtUltra>(&mut handle_sequence)
3514                }
3515                StrategyTag::BtUltra2 => {
3516                    self.compress_block::<strategy::BtUltra2>(&mut handle_sequence)
3517                }
3518                _ => unreachable!(
3519                    "SearchMethod::BinaryTree requires a BT strategy tag (Btlazy2/BtOpt/BtUltra/BtUltra2)"
3520                ),
3521            },
3522        }
3523    }
3524
3525    fn skip_matching(&mut self) {
3526        self.skip_matching_with_hint(None);
3527    }
3528
3529    fn skip_matching_with_hint(&mut self, incompressible_hint: Option<bool>) {
3530        // Borrowed one-shot Fast path: a staged block range routes to the
3531        // borrowed skip (records the range for `get_last_space`, primes
3532        // hashes on the dict-priming hint) with no owned-history append
3533        // and nothing to recycle. Stage is consumed.
3534        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3535            match self.active_backend() {
3536                super::strategy::BackendTag::Simple => self.simple_mut().skip_matching_borrowed(
3537                    block_start,
3538                    block_end,
3539                    incompressible_hint,
3540                ),
3541                super::strategy::BackendTag::Dfast => self
3542                    .dfast_matcher_mut()
3543                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3544                super::strategy::BackendTag::Row => self.row_matcher_mut().skip_matching_borrowed(
3545                    block_start,
3546                    block_end,
3547                    incompressible_hint,
3548                ),
3549                super::strategy::BackendTag::HashChain => self
3550                    .hc_matcher_mut()
3551                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3552            }
3553            return;
3554        }
3555        match self.active_backend() {
3556            super::strategy::BackendTag::Simple => {
3557                self.simple_mut()
3558                    .skip_matching_with_hint(incompressible_hint);
3559                self.recycle_simple_space();
3560            }
3561            super::strategy::BackendTag::Dfast => {
3562                self.dfast_matcher_mut().skip_matching(incompressible_hint)
3563            }
3564            super::strategy::BackendTag::Row => self
3565                .row_matcher_mut()
3566                .skip_matching_with_hint(incompressible_hint),
3567            super::strategy::BackendTag::HashChain => {
3568                self.hc_matcher_mut().skip_matching(incompressible_hint)
3569            }
3570        }
3571    }
3572}
3573
3574impl MatchGeneratorDriver {
3575    /// Monomorphised optimal-parser entry point. Only the `BinaryTree`
3576    /// search arm of [`Matcher::start_matching`] routes here, selecting
3577    /// the concrete opt `S: Strategy` (BtOpt / BtUltra / BtUltra2) off
3578    /// `strategy_tag`, so the optimiser keeps the cost-model predicates
3579    /// (`S::USE_BT` / `S::USE_HASH3` / `S::ACCURATE_PRICE` /
3580    /// `S::TWO_PASS_SEED`) const-folded per strategy. The non-opt search
3581    /// backends (Fast / DoubleFast / RowHash / HashChain) are dispatched
3582    /// directly off the search axis and never reach this method, so all
3583    /// strategies arriving here are HashChain-backed.
3584    fn compress_block<S: super::strategy::Strategy>(
3585        &mut self,
3586        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
3587    ) {
3588        debug_assert_eq!(S::BACKEND, super::strategy::BackendTag::HashChain);
3589        debug_assert!(
3590            S::USE_BT,
3591            "compress_block only handles the optimal (BT) path"
3592        );
3593        self.hc_matcher_mut()
3594            .start_matching_strategy::<S>(handle_sequence);
3595    }
3596}
3597
3598/// Stage D: backend storage discriminator.
3599///
3600/// HC (lazy / lazy2) modes carry no extra per-frame state beyond the
3601/// shared `MatchTable` and `HcMatcher` runtime knobs, so the
3602/// [`HcBackend::Hc`] variant is zero-sized — no BT scratch is
3603/// allocated. BT-flavoured modes (`btopt` / `btultra` / `btultra2`)
3604/// hold the full [`super::bt::BtMatcher`] inside the
3605/// [`HcBackend::Bt`] variant (cost model, optimal-parser scratch
3606/// arenas, LDM candidate buffer).
3607///
3608/// The discriminator lives next to `parse_mode` so `configure()` can
3609/// promote between the two on a level change without touching the
3610/// `MatchTable` storage.
3611#[derive(Clone)]
3612pub(crate) enum HcBackend {
3613    /// Lazy / lazy2 modes — no per-frame backend state.
3614    Hc,
3615    /// BT-driven modes — owns the optimal parser's per-frame scratch.
3616    /// Boxed so the enum stays pointer-sized: HC-only matchers pay
3617    /// just the `Box`-niche, not the 4 KiB `BtMatcher` payload.
3618    Bt(alloc::boxed::Box<super::bt::BtMatcher>),
3619}
3620
3621impl HcBackend {
3622    /// Heap bytes held by the backend. `Hc` is zero-sized; `Bt` boxes a
3623    /// `BtMatcher`, so count the boxed payload plus its own scratch heap.
3624    fn heap_size(&self) -> usize {
3625        match self {
3626            Self::Hc => 0,
3627            Self::Bt(bt) => core::mem::size_of::<super::bt::BtMatcher>() + bt.heap_size(),
3628        }
3629    }
3630
3631    /// Mutable accessor on the BT matcher; panics if the active
3632    /// backend is `Hc`. The HC-or-Bt branches in orchestrator code use
3633    /// `let HcBackend::Bt(bt) = &self.backend` directly for readonly
3634    /// access — this helper exists so macro bodies that already drive
3635    /// a mutable BT update through the optimal parser can write
3636    /// `$self.backend.bt_mut().X` without an outer `match` ladder.
3637    #[inline(always)]
3638    pub(crate) fn bt_mut(&mut self) -> &mut super::bt::BtMatcher {
3639        match self {
3640            Self::Bt(bt) => bt,
3641            Self::Hc => unreachable!("BT-only accessor called in HC mode"),
3642        }
3643    }
3644}
3645
3646#[derive(Clone)]
3647struct HcMatchGenerator {
3648    /// Shared match-finder storage (window, history, hash / chain /
3649    /// hash3 tables, dictionary-priming flags). Used identically by HC
3650    /// and BT modes; backend-specific table interpretation lives in the
3651    /// matcher methods on this struct.
3652    table: super::match_table::storage::MatchTable,
3653    /// HC runtime knobs (lazy_depth, search_depth, target_len). Always
3654    /// present — BT modes still consult `hc.search_depth` for repcode
3655    /// probing and chain candidate enumeration.
3656    hc: super::hc::HcMatcher,
3657    /// Backend discriminator. [`HcBackend::Hc`] is zero-sized for the
3658    /// lazy / lazy2 path so HC-only generators don't carry the BT
3659    /// optimal-parser scratch buffers. [`HcBackend::Bt`] holds the
3660    /// `BtMatcher` when an optimal mode is configured.
3661    backend: HcBackend,
3662    /// Compile-time strategy tag mirrored from
3663    /// [`MatchGeneratorDriver::strategy_tag`] during `configure()`.
3664    /// The driver hot path never reads this — it dispatches to
3665    /// `compress_block::<S>` from its own tag — but the
3666    /// `#[cfg(test)] start_matching` helper consumes it so artificial
3667    /// test setups still pick the correct concrete `S` for the
3668    /// const-generic optimal parser (BtOpt vs BtUltra vs BtUltra2).
3669    /// Without this field the test path would have to collapse
3670    /// `BtOpt` and `BtUltra` onto the same monomorphisation since
3671    /// `table.uses_bt` / `table.is_btultra2` alone can't tell them
3672    /// apart.
3673    strategy_tag: super::strategy::StrategyTag,
3674}
3675
3676// Plain-data types relocated to [`crate::encoding::opt::types`] and
3677// [`crate::encoding::opt::ldm`] by #111 Phase 1. The use statements at
3678// the top of this file bring them back into scope so the existing
3679// methods on `HcMatchGenerator` compile unchanged.
3680
/// `bt_insert_step_no_rebase` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Each kernel-specific wrapper invokes
/// the macro with its own `fastpath::<kernel>::count_match_from_indices`
/// path so the call resolves inside the wrapper's `#[target_feature]`
/// umbrella and inlines instead of paying the function-call ABI per BT walk
/// iteration. Used only by `HcMatchGenerator` BT walk wrappers below.
///
/// Parameters:
/// - `$table`: expression for the match table. The body reads its
///   `history_abs_start`, `hash_log`, `search_mls`, `position_base` and
///   `index_shift`, and writes its `hash_table` and `chain_table`.
/// - `$search_depth`: initial compare budget; the walk stops once that many
///   candidates have been compared.
/// - `$abs_pos`: absolute position being inserted into the tree.
/// - `$current_abs_end`: absolute end of the current block; caps the match
///   length through `tail_limit`.
/// - `$target_abs`: forwarded to `window_low_abs_for_target` to obtain the
///   lowest candidate position the walk may visit.
/// - `$cmf`: path of the match-length counting function, called inside an
///   `unsafe` block.
///
/// The expansion is a block expression of type `usize`. It also contains
/// early `return 1` statements, so it must be expanded directly inside a
/// function whose return type accepts that value. The result is
/// `max(speed_positions, match_end_abs - (abs_pos + 8))`.
/// NOTE(review): this looks like the number of positions the caller may
/// advance (upstream `ZSTD_insertBt1` convention) — confirm against callers.
///
/// Crate-private: the macro body references private `encoding::*`
/// modules via `$crate::...`, so it is unusable downstream and is
/// re-exported only inside this crate via `pub(crate) use` below.
macro_rules! bt_insert_step_no_rebase_body {
    ($table:expr, $search_depth:expr, $abs_pos:ident, $current_abs_end:ident, $target_abs:ident, $cmf:path) => {{
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        //
        // SAFETY: pointer and length come from a live slice. The detached
        // slice stays valid only while the buffer behind `live_history()` is
        // neither reallocated nor written for the rest of this body; the code
        // below writes only `hash_table` and `chain_table`.
        // NOTE(review): assumes none of the `$table` helper calls below touch
        // that buffer — confirm.
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        // Fewer than 8 readable bytes at `idx`: bail out without inserting.
        if idx + 8 > concat.len() {
            return 1;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT walker called past current block end"
        );
        // Upper bound on any match length measured from `abs_pos`.
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults; `hash` indexes
            // `hash_table` directly below, so it is in bounds.
            unsafe {
                _mm_prefetch($table.hash_table.as_ptr().add(hash).cast(), _MM_HINT_T0);
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: the prefetch itself never faults, but `pointer::add`
                // requires the offset to stay inside the `hash_table`
                // allocation. `hash_next` comes from the same
                // `hash_position_at(.., hash_log, search_mls)` call shape as
                // `hash`, which indexes `hash_table` with a bounds check
                // below.
                // NOTE(review): assumes `hash_position_at` always returns an
                // index below `hash_table.len()` for this `hash_log` —
                // confirm; `hash_next` is never bounds-checked here.
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return 1;
        };
        // Table slots store `relative position + 1`; the candidate decode in
        // the loop below subtracts that 1 again.
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // `abs_pos < bt_mask` legitimately happens for the first BT walk of
        // a fresh frame (bt_low effectively "no floor"). Saturating keeps
        // the floor at 0 so the `candidate_abs <= bt_low` check never
        // triggers early; raw subtraction would underflow into a huge
        // sentinel that ALWAYS triggers.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        // Hoist the BT pointer-pair base out of `self` once — see the
        // collect-matches body for the full rationale (per-step Vec reload +
        // bounds check through `&mut self` vs the upstream zstd's raw `U32*` walk).
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        let window_low = $table.window_low_abs_for_target($target_abs);
        // `abs_pos + 9` is safe in raw form: `MatchTable::add_data` caps
        // total input at `usize::MAX - STREAM_ABS_HEADROOM` (where
        // `STREAM_ABS_HEADROOM = HC_OPT_NUM + 16`), so every
        // frame-lifetime absolute cursor passed to the BT walker stays
        // below `usize::MAX - 9` regardless of stream length or
        // pointer width. The guard is hoisted to the data-ingest
        // boundary so this per-position site pays zero arithmetic
        // overhead in the hot loop.
        let mut match_end_abs = $abs_pos + 9;
        let mut best_len = 8usize;
        let mut compares_left = $search_depth;
        // Match lengths recorded on the last step taken in each direction;
        // their minimum seeds the next comparison (`seed_len`).
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        // The two `chain_table` slots of `abs_pos`'s own pair; the walk fills
        // them and re-targets the two cursors as it descends.
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // Read the old bucket head as the first candidate, then make
        // `abs_pos` the new head.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;

        while compares_left > 0 {
            if match_stored == $crate::encoding::match_table::storage::HC_EMPTY {
                break;
            }
            // Reject stale post-rebase slots whose pre-shift position is below
            // `index_shift` explicitly. A `wrapping_sub` maps such a slot to a
            // near-`usize::MAX` value that the `>= abs_pos` test only rejects
            // while `abs_pos` is far from the integer ceiling; on a
            // long-running rebased stream (reachable on 32-bit) `abs_pos` can
            // approach the ceiling and the wrapped value can land back inside
            // `[window_low, abs_pos)`. `checked_sub` ends the walk on the
            // underflow instead. `match_stored != HC_EMPTY` here, so the `- 1`
            // cannot underflow.
            let Some(candidate_abs) = ($table.position_base + (match_stored as usize - 1))
                .checked_sub($table.index_shift)
            else {
                break;
            };
            // Candidates outside `[window_low, abs_pos)` end the walk.
            if candidate_abs < window_low || candidate_abs >= $abs_pos {
                break;
            }
            compares_left -= 1;

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()` since the candidate is within
            // `[history_abs_start, abs_pos)` and `tail_limit ≤
            // current_abs_end - abs_pos`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            if match_len > best_len {
                best_len = match_len;
                // `candidate_abs + match_len <= current_abs_end` by BT walk
                // invariant — `match_len <= tail_limit = current_abs_end -
                // abs_pos` and `candidate_abs < abs_pos`.
                let candidate_end = candidate_abs + match_len;
                if candidate_end > match_end_abs {
                    match_end_abs = candidate_end;
                }
            }

            // The match reaches the block end, so there is no differing byte
            // to order the two positions by: stop here.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // Candidate sorts below the current position: link it through
                // the "smaller" cursor and continue with its "larger" child.
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // Candidate sorts at or above the current position: mirror
                // image of the branch above.
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // Terminate whichever cursors are still open with `HC_EMPTY`.
        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        // Best matches longer than 384 bytes contribute `best_len - 384`,
        // capped at 192, to the result below.
        let speed_positions = if best_len > 384 {
            (best_len - 384).min(192)
        } else {
            0
        };
        // `match_end_abs` is initialized to `abs_pos + 9` and is only
        // reassigned inside the `candidate_end > match_end_abs` branch
        // above. So even though an individual `candidate_end =
        // candidate_abs + match_len` can land below `abs_pos` (the
        // candidate sits earlier in history and the match runs short),
        // the variable itself never drops below its initial value.
        // That gives `match_end_abs ≥ abs_pos + 9 > abs_pos + 8` as a
        // loop-wide invariant, so the raw subtraction below cannot
        // underflow.
        speed_positions.max(match_end_abs - ($abs_pos + 8))
    }};
}
pub(crate) use bt_insert_step_no_rebase_body;
3911
// NOTE(review): the paragraph below describes the `build_optimal_plan_impl`
// body macro, which is not defined at this spot. It was written as `///`
// lines running straight into the `btlazy2_offbase` docs, so rustdoc attached
// it to that function. It is kept verbatim as a plain comment so nothing is
// lost; move it next to the macro it documents.
//
// `build_optimal_plan_impl` body parameterized over the per-CPU
// `collect_optimal_candidates_initialized_<kernel>` method name. Caller
// passes its `&mut self`, the seven DP entry-point arguments, and the
// kernel-specific collect method. Each per-kernel wrapper invokes this
// macro inside its own `#[target_feature]` umbrella so the per-position
// `$collect` call inlines and the entire DP loop runs as one straight-line
// hot path without an ABI barrier between the DP and the match-gathering
// pipeline.
//
// Body is ~730 lines but mechanically identical across kernels — the macro
// keeps a single source of truth. The two const generics
// (`ACCURATE_PRICE`, `FAVOR_SMALL_OFFSETS`) come from the wrapper's
// generic parameter list and are referenced as bare identifiers; macro
// hygiene resolves them at the expansion site.

/// Upstream zstd `offBase` for the btlazy2 lazy gain heuristic: a match whose
/// offset equals one of the three active repeat offsets prices as the cheap
/// repcode code (1/2/3); any other offset prices as `offset + 3`. So an
/// equal-length repeat-offset match always out-gains an explicit-offset one
/// (`zstd_lazy.c` `ZSTD_storeSeq` offBase convention).
///
/// * `offset` — match distance in bytes. It must fit `u32` with room for the
///   `+ 3` bias; this is checked in debug builds only.
/// * `reps` — the three active repeat offsets, most recent first.
/// * `ll0` — `true` when the match starts at a zero-literal position, which
///   shifts the repcode mapping (see below).
///
/// Returns `1`, `2` or `3` for a repcode hit, otherwise `offset + 3`.
#[inline]
fn btlazy2_offbase(offset: usize, reps: [u32; 3], ll0: bool) -> u32 {
    // Offsets are < window (<= 2^27), so the cast below never truncates and
    // `+ 3` never overflows u32. State that invariant instead of only
    // commenting it: a truncated offset could alias a repeat offset and be
    // priced as a cheap repcode. Debug-only, so release builds are unchanged.
    debug_assert!(
        u32::try_from(offset).map_or(false, |fits| fits <= u32::MAX - 3),
        "btlazy2 offset {offset} is outside the u32 offBase domain"
    );
    let o = offset as u32;
    // Upstream zstd repcode mapping shifts by `ll0` (zero-literal position): the cheap
    // codes become rep1 / rep2 / (rep0 - 1) instead of rep0 / rep1 / rep2,
    // because at ll0 an offset equal to rep0 is the special rep0-1 case, not
    // repcode 1. Scoring offsets against the wrong code at ll0 over-rewards a
    // rep0-distance match that does not actually encode as the cheapest code.
    if ll0 {
        if o == reps[1] {
            1
        } else if o == reps[2] {
            2
        } else if reps[0] > 1 && o == reps[0] - 1 {
            // `reps[0] > 1` keeps `rep0 - 1` a non-zero distance.
            3
        } else {
            o + 3
        }
    } else if o == reps[0] {
        1
    } else if o == reps[1] {
        2
    } else if o == reps[2] {
        3
    } else {
        o + 3
    }
}
3961
3962/// Upstream zstd lazy match gain (`matchLength * 4 - ZSTD_highbit32(offBase)`): the
3963/// selection metric that lets a shorter repeat-offset match beat a longer
3964/// explicit-offset one. `offBase >= 1`, so `highbit` is well-defined.
3965#[inline]
3966fn btlazy2_gain(match_len: usize, offset: usize, reps: [u32; 3], ll0: bool) -> i64 {
3967    let offbase = btlazy2_offbase(offset, reps, ll0);
3968    (match_len as i64) * 4 - (31 - offbase.leading_zeros()) as i64
3969}
3970
/// Per-kernel body of the `btlazy2` (levels 13-15) greedy/lazy parse over
/// the binary-tree match finder. Mirrors `build_optimal_plan_impl_body!`'s
/// kernel-dispatch discipline: the wrapper carries the `#[target_feature]`
/// umbrella and passes its tier-specific `collect_optimal_candidates_initialized_<kernel>`
/// as `$collect`, so the per-position BT collect (and its inlined cpl)
/// stays under one umbrella — the runtime `select_kernel()` dispatch happens
/// ONCE per block in the bare `start_matching_btlazy2`, never per position.
///
/// Arguments:
/// - `$self`: the matcher receiver; the body reads and mutates `$self.table`,
///   `$self.hc` and `$self.backend`.
/// - `$handle_sequence`: callable invoked with every emitted `Sequence`
///   (`Triple` for a match plus the literals before it, `Literals` for the
///   unmatched block tail).
/// - `$collect`: name of the method on `$self` that fills the candidate list
///   for one position.
/// - `$cmf`: path of the match-length function used by the direct repcode
///   probe.
///
/// The expansion contains a bare `return;` for an empty block, so it has to
/// be expanded inside a function returning `()`.
macro_rules! start_matching_btlazy2_body {
    ($self:ident, $handle_sequence:ident, $collect:ident, $cmf:path $(,)?) => {{
        $self.table.ensure_tables();
        // Borrowed-aware: owned → last committed chunk; borrowed → staged block.
        let (current_abs_start, current_len) = $self.table.current_block_range();
        if current_len == 0 {
            // Empty block: nothing to parse or emit.
            return;
        }
        let current_ptr = $self.table.get_last_space().as_ptr();
        // Mutates tables but never reallocates `history`, so this tail slice
        // stays valid for the routine's duration (same as the other parsers).
        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
        // Full contiguous live region (owned: dict + prior blocks + current
        // block in `history`; borrowed: `[0, block_end)` of the in-place
        // input) as a raw slice, for the explicit repcode probe: a rep offset
        // can point before the current block, which `current` can't reach.
        // `live_history()` is borrowed-aware; reborrow-then-raw-ptr so the
        // slice holds NO borrow and coexists with the `&mut self` collector
        // calls below. Same no-realloc validity contract as `current`.
        let history_abs_start = $self.table.history_abs_start;
        let concat_full: &[u8] = unsafe {
            let lh = $self.table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        let current_abs_end = current_abs_start + current_len;
        $self
            .table
            .apply_limited_update_after_long_match(current_abs_start);
        $self
            .table
            .backfill_boundary_positions(current_abs_start, current_abs_end);

        let profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::Btlazy2>();
        // Borrow the candidate scratch buffer out of the BT backend for the
        // duration of the parse; it is put back at the end of the expansion.
        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);

        // Upper bound on the number of accepted lazy deferrals per match.
        let depth = $self.hc.lazy_depth as usize;
        // `pos` is the block-relative parse cursor; `literals_start` is the
        // first byte not yet emitted in any sequence.
        let mut pos = 0usize;
        let mut literals_start = 0usize;

        // Collect + select the highest-GAIN match at a position (upstream zstd
        // `ZSTD_searchMax` plus the explicit offset_1 repcode check): scan the
        // length-sorted BT/dms ladder by gain, then probe rep0 directly since
        // the ladder's strictly-increasing-length filter drops short cheap
        // reps. Expands to `(match_len, offset)`; `match_len == 0` = no match.
        macro_rules! bt_select {
            ($p:expr) => {{
                let sel_pos: usize = $p;
                // `ll0` (upstream zstd): zero literals pending before this position, so
                // the repcode set is shifted (see `btlazy2_offbase`).
                let ll0 = sel_pos == literals_start;
                let sel_abs = current_abs_start + sel_pos;
                candidates.clear();
                let query = HcCandidateQuery {
                    reps: $self.table.offset_hist,
                    lit_len: sel_pos - literals_start,
                    // No LDM seed: L13-15 run at windowLog 22, below upstream zstd's
                    // LDM auto-enable threshold (windowLog >= 27).
                    ldm_candidate: None,
                };
                // SAFETY: called inside the wrapper's `#[target_feature]`
                // umbrella (the scalar wrapper's `$collect` is a safe fn).
                unsafe {
                    $self.$collect::<super::strategy::Btlazy2, true>(
                        sel_abs,
                        current_abs_end,
                        profile,
                        query,
                        &mut candidates,
                    );
                }
                let reps = $self.table.offset_hist;
                let mut sel_ml = 0usize;
                let mut sel_off = 0usize;
                // `i64::MIN` so that the first usable candidate (or the rep
                // probe below) always wins the strict `>` comparison.
                let mut sel_gain = i64::MIN;
                for c in candidates.iter() {
                    // Clamp to the bytes left in this block.
                    let ml = c.match_len.min(current_len - sel_pos);
                    if ml < HC_OPT_MIN_MATCH_LEN {
                        continue;
                    }
                    let g = btlazy2_gain(ml, c.offset, reps, ll0);
                    if g > sel_gain {
                        sel_gain = g;
                        sel_ml = ml;
                        sel_off = c.offset;
                    }
                }
                // Position of `sel_abs` relative to `history_abs_start`; used
                // as the index into `concat_full`.
                let sel_idx = sel_abs - history_abs_start;
                // Upstream zstd probes `rep[0 + ll0]` directly (the length-sorted ladder
                // drops short cheap reps): rep0 normally, rep1 at a zero-literal
                // position where rep0 is not the cheapest code.
                let probe_rep = if ll0 {
                    reps[1] as usize
                } else {
                    reps[0] as usize
                };
                if probe_rep != 0 && sel_idx >= probe_rep {
                    let tail = current_len - sel_pos;
                    // SAFETY: `sel_idx - probe_rep < sel_idx`, `sel_idx + tail <=
                    // concat_full.len()`; same overshoot slack the collector
                    // relies on for this block.
                    let rep_ml =
                        unsafe { $cmf(concat_full, sel_idx, sel_idx - probe_rep, tail, 0) };
                    if rep_ml >= HC_OPT_MIN_MATCH_LEN
                        && btlazy2_gain(rep_ml, probe_rep, reps, ll0) > sel_gain
                    {
                        // `sel_gain` is not refreshed: it is not read again
                        // after this probe.
                        sel_ml = rep_ml;
                        sel_off = probe_rep;
                    }
                }
                (sel_ml, sel_off)
            }};
        }

        while pos + HC_OPT_MIN_MATCH_LEN <= current_len {
            let (mut best_ml, mut best_off) = bt_select!(pos);
            if best_ml < HC_OPT_MIN_MATCH_LEN {
                // No usable match: the byte stays pending (`literals_start`
                // is unchanged) and is emitted with a later sequence.
                pos += 1;
                continue;
            }
            // Lazy lookahead (upstream zstd depth 1/2): advance one byte and accept the
            // later match only if it out-gains the current one by the upstream zstd
            // margin (deferring costs an extra literal — `+4` at depth 1, `+7`
            // at depth 2). `start` tracks where the chosen match begins.
            let mut start = pos;
            // Number of deferrals accepted so far for this match.
            let mut d = 0usize;
            while d < depth && start + 1 + HC_OPT_MIN_MATCH_LEN <= current_len {
                let look = start + 1;
                let (ml2, off2) = bt_select!(look);
                if ml2 < HC_OPT_MIN_MATCH_LEN {
                    break;
                }
                let reps = $self.table.offset_hist;
                let margin = if d == 0 { 4 } else { 7 };
                // `best` sits at `start` (ll0 iff no literals precede it); the
                // lookahead match at `start + 1` always has a pending literal.
                let gain1 = btlazy2_gain(best_ml, best_off, reps, start == literals_start) + margin;
                let gain2 = btlazy2_gain(ml2, off2, reps, false);
                if gain2 > gain1 {
                    best_ml = ml2;
                    best_off = off2;
                    start = look;
                    d += 1;
                } else {
                    break;
                }
            }
            // Commit the chosen match at `start`; [literals_start, start) is
            // emitted as literals. `best_ml` was bounded to `current_len -
            // start` at selection, so `start + best_ml <= current_len`.
            let lit_len = start - literals_start;
            let literals = &current[literals_start..start];
            $handle_sequence(Sequence::Triple {
                literals,
                offset: best_off,
                match_len: best_ml,
            });
            // Called for its effect on `offset_hist`; the returned value is
            // deliberately discarded.
            let _ = encode_offset_with_history(
                best_off as u32,
                lit_len as u32,
                &mut $self.table.offset_hist,
            );
            pos = start + best_ml;
            literals_start = pos;
        }

        // Flush whatever was not covered by a match as a literals-only sequence.
        if literals_start < current_len {
            $handle_sequence(Sequence::Literals {
                literals: &current[literals_start..],
            });
        }
        // Hand the scratch buffer back so later calls can reuse it.
        $self.backend.bt_mut().opt_candidates_scratch = candidates;
    }};
}
4151
/// 8-lane `next_cost < node_price` mask for the optimal-parser price-set
/// loop. AVX2 lacks an unsigned `cmplt`, so derive `nc < np` from
/// `min_epu32`: `nc <= np` iff `min(nc,np) == nc`, then exclude equality.
/// Returns a bitmask (bit `k` set => lane `k` improves). Compiled on every
/// x86 target (same as the avx2 collect kernel); the cargo `kernel_avx2`
/// feature only gates the runtime dispatch, not compilation.
///
/// # Safety
///
/// The executing CPU must support AVX2, and `node_price` must hold at least
/// 8 elements: its first 8 values are read with one unaligned 256-bit load.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_improved_mask8_avx2(next_cost: &[u32; 8], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    // `node_price` is an unsized slice: guard the 8-wide load the same way
    // the cached-price kernels guard theirs.
    debug_assert!(node_price.len() >= 8);
    // SAFETY: `next_cost` is exactly 8 u32; `node_price` has >= 8 u32 per the
    // caller contract above. Both loads are unaligned-tolerant.
    let nc = unsafe { _mm256_loadu_si256(next_cost.as_ptr() as *const __m256i) };
    let np = unsafe { _mm256_loadu_si256(node_price.as_ptr() as *const __m256i) };
    let min = _mm256_min_epu32(nc, np);
    let le = _mm256_cmpeq_epi32(min, nc); // nc <= np
    let eq = _mm256_cmpeq_epi32(nc, np); // nc == np
    let lt = _mm256_andnot_si256(eq, le); // nc < np
    // One sign bit per 32-bit lane -> bit k of the result is lane k.
    _mm256_movemask_ps(_mm256_castsi256_ps(lt)) as u8
}
4184
4185/// Inline `next_cost = base_cost + ll0_price + match_price_from_parts(off,ml)`
4186/// for one match length — the exact `add_prices` chain the scalar loop uses,
4187/// so the SoA vector path stays byte-identical.
4188#[inline(always)]
4189#[allow(clippy::too_many_arguments)]
4190fn priceset_next_cost(
4191    profile: HcOptimalCostProfile,
4192    stats: &HcOptState,
4193    ml_cache: &mut [[u32; 2]],
4194    ml_stamp: u32,
4195    match_len: usize,
4196    ll0_price: u32,
4197    off_price: u32,
4198    base_cost: u32,
4199) -> u32 {
4200    let ml_price =
4201        BtMatcher::cached_match_length_price(profile, stats, match_len, ml_cache, ml_stamp);
4202    let seq_cost = BtMatcher::add_prices(
4203        ll0_price,
4204        profile.match_price_from_parts(off_price, ml_price, stats),
4205    );
4206    BtMatcher::add_prices(base_cost, seq_cost)
4207}
4208
4209/// Scalar price-set over the match-length range `[start, max]` for the
4210/// NON-abort optimal modes (btultra / btultra2). Each `match_len` writes a
4211/// distinct node `pos + match_len`, so order is irrelevant; the improvement
4212/// test reduces to `next_cost < node_prices[next]` (`reset_opt_nodes` set
4213/// every beyond-frontier cell to `u32::MAX`, subsuming `next > last_pos`).
4214/// `#[inline]` so it folds into each per-tier optimal-parser monomorphisation
4215/// (no call overhead). Returns the highest written `next`.
4216#[inline]
4217#[allow(clippy::too_many_arguments)]
4218// Used by the scalar / sse42 DP wrappers; on aarch64 the dispatch only reaches
4219// the neon wrapper and on wasm+simd128 only the simd128 wrapper, so this is
4220// cfg-dead on those targets.
4221#[cfg_attr(
4222    any(
4223        all(target_arch = "aarch64", target_endian = "little"),
4224        all(target_arch = "wasm32", target_feature = "simd128")
4225    ),
4226    allow(dead_code)
4227)]
4228fn priceset_range_nonabort_scalar(
4229    node_prices: &mut [u32],
4230    nodes: &mut [HcOptimalNode],
4231    ml_cache: &mut [[u32; 2]],
4232    ml_stamp: u32,
4233    profile: HcOptimalCostProfile,
4234    stats: &HcOptState,
4235    pos: usize,
4236    start: usize,
4237    max: usize,
4238    ll0_price: u32,
4239    off_price: u32,
4240    base_cost: u32,
4241    off: u32,
4242    reps: [u32; 3],
4243    last_pos: usize,
4244) -> usize {
4245    let mut new_last = last_pos;
4246    for ml in start..=max {
4247        let next_cost = priceset_next_cost(
4248            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4249        );
4250        let next = pos + ml;
4251        if next_cost < node_prices[next] {
4252            node_prices[next] = next_cost;
4253            nodes[next] = HcOptimalNode {
4254                off,
4255                mlen: ml as u32,
4256                litlen: 0,
4257                reps,
4258            };
4259            if next > new_last {
4260                new_last = next;
4261            }
4262        }
4263    }
4264    new_last
4265}
4266
/// Checks every compiled SIMD tier's deinterleave and improve-mask helpers
/// against a plain scalar reference.
///
/// Runtime dispatch only ever selects one tier per host, so the roundtrip
/// suite never reaches the other tiers. This test calls the helpers directly
/// on whatever ISA the test host exposes (AVX2 + SSE4.1 on x86, NEON on
/// aarch64).
#[cfg(test)]
#[test]
fn priceset_tier_helpers_match_scalar() {
    // Reference deinterleave: ordered prices when all `W` cells carry
    // `stamp`, otherwise `None`.
    fn scalar_deint<const W: usize>(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; W]> {
        let mut prices = [0u32; W];
        for (lane, slot) in prices.iter_mut().enumerate() {
            let [price, generation] = cells[lane];
            if generation != stamp {
                return None;
            }
            *slot = price;
        }
        Some(prices)
    }
    // Reference mask: bit `lane` set when `nc[lane] < np[lane]`.
    fn scalar_mask<const W: usize>(nc: &[u32; W], np: &[u32]) -> u8 {
        (0..W)
            .filter(|&lane| nc[lane] < np[lane])
            .fold(0u8, |bits, lane| bits | (1u8 << lane))
    }

    const S: u32 = 0x55;
    let warm: [[u32; 2]; 4] = [[11, S], [22, S], [33, S], [44, S]];
    // Same cells with one stale generation -> must yield None.
    let cold = {
        let mut cells = warm;
        cells[2][1] = S ^ 1;
        cells
    };
    let nc4: [u32; 4] = [10, 99, 30, 41];
    let np4: [u32; 4] = [20, 21, 30, 99]; // lt: lane0 (10<20), lane3 (41<99)

    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    unsafe {
        let expected_prices = scalar_deint::<4>(&warm, S);
        assert_eq!(priceset_cached_prices4_neon(&warm, S), expected_prices);
        assert_eq!(priceset_cached_prices4_neon(&cold, S), None);
        let expected_mask = scalar_mask::<4>(&nc4, &np4);
        assert_eq!(priceset_improved_mask4_neon(&nc4, &np4), expected_mask);
    }
    #[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
    {
        if std::is_x86_feature_detected!("sse4.2") {
            unsafe {
                let expected_prices = scalar_deint::<4>(&warm, S);
                assert_eq!(priceset_cached_prices4_sse41(&warm, S), expected_prices);
                assert_eq!(priceset_cached_prices4_sse41(&cold, S), None);
                let expected_mask = scalar_mask::<4>(&nc4, &np4);
                assert_eq!(priceset_improved_mask4_sse41(&nc4, &np4), expected_mask);
            }
        }
        if std::is_x86_feature_detected!("avx2") {
            let warm8: [[u32; 2]; 8] = [
                [11, S],
                [22, S],
                [33, S],
                [44, S],
                [55, S],
                [66, S],
                [77, S],
                [88, S],
            ];
            let cold8 = {
                let mut cells = warm8;
                cells[5][1] = S ^ 1;
                cells
            };
            let nc8: [u32; 8] = [10, 99, 30, 41, 99, 60, 99, 80];
            let np8: [u32; 8] = [20, 21, 30, 99, 50, 99, 70, 99];
            unsafe {
                let expected_prices = scalar_deint::<8>(&warm8, S);
                assert_eq!(priceset_cached_prices8_avx2(&warm8, S), expected_prices);
                assert_eq!(priceset_cached_prices8_avx2(&cold8, S), None);
                let expected_mask = scalar_mask::<8>(&nc8, &np8);
                assert_eq!(priceset_improved_mask8_avx2(&nc8, &np8), expected_mask);
            }
        }
    }
}
4358
4359/// Shared vectorised price-set loop body, generic over the SIMD width `W`.
4360/// The per-tier `deint` (vector-load plus deinterleave of `W` cached prices,
4361/// returning `Some` only on an all-warm chunk) and `mask` (per-tier
4362/// `next_cost` less-than `node_price` bitmask) are passed as zero-sized
4363/// `impl Fn`s. `#[inline(always)]` plus monomorphisation folds `deint` and
4364/// `mask` directly into each per-tier wrapper's `target_feature` umbrella, so
4365/// the intrinsics inline with no call ABI and no runtime feature detection.
4366/// Cold or out-of-cache chunks, and the sub-`W` remainder, fall back to the
4367/// scalar `priceset_next_cost` (which fills the cache); writes are
4368/// scalar-scatter on the improving lanes (1-8% of compares, per the
4369/// improve-ratio probe). Same signature tail as the scalar variant.
4370#[inline(always)]
4371#[allow(clippy::too_many_arguments)]
4372// Instantiated only by a vector tier wrapper (avx2/sse4.1 on x86, neon on
4373// aarch64, simd128 on wasm+simd128); a target with none of those (e.g.
4374// wasm without +simd128) uses only the scalar range, leaving this generic dead.
4375#[cfg_attr(
4376    not(any(
4377        target_arch = "x86",
4378        target_arch = "x86_64",
4379        all(target_arch = "aarch64", target_endian = "little"),
4380        all(target_arch = "wasm32", target_feature = "simd128")
4381    )),
4382    allow(dead_code)
4383)]
4384fn priceset_range_vec<const W: usize>(
4385    node_prices: &mut [u32],
4386    nodes: &mut [HcOptimalNode],
4387    ml_cache: &mut [[u32; 2]],
4388    ml_stamp: u32,
4389    profile: HcOptimalCostProfile,
4390    stats: &HcOptState,
4391    pos: usize,
4392    start: usize,
4393    max: usize,
4394    ll0_price: u32,
4395    off_price: u32,
4396    base_cost: u32,
4397    off: u32,
4398    reps: [u32; 3],
4399    last_pos: usize,
4400    deint: impl Fn(&[[u32; 2]], u32) -> Option<[u32; W]>,
4401    mask: impl Fn(&[u32; W], &[u32]) -> u8,
4402) -> usize {
4403    let mut new_last = last_pos;
4404    let mut buf = [0u32; W];
4405    // Loop-invariant constant of the byte-identical next_cost chain:
4406    // next_cost = add_prices(base_cost, add_prices(ll0_price,
4407    //   match_price_from_parts(off_price, ml_price))) = c_base + ml_price,
4408    // c_base = base_cost + ll0_price + match_price_from_parts(off_price, 0).
4409    //
4410    // This stays bit-exact with the scalar `priceset_next_cost` because both
4411    // helpers are affine in `ml_price`: `BtMatcher::add_prices(a, b) = a + b`
4412    // and `match_price_from_parts(off, ml) = off + ml + bias` are plain integer
4413    // additions, so `match_price_from_parts(off, ml) = match_price_from_parts(
4414    // off, 0) + ml` and the whole chain collapses to `c_base + ml_price`. The
4415    // `wrapping_add` here matches the scalar `+` under the cost model's
4416    // no-overflow invariant (the `debug_assert`s in both helpers). Factoring the
4417    // combine into one helper per the review suggestion would force a per-lane
4418    // `match_price_from_parts(off, ml_price)` recompute instead of hoisting the
4419    // ml-independent `c_base` once — a regression on this hot DP loop — so the
4420    // hoist is kept and the equivalence documented here instead.
4421    let c_base = base_cost
4422        .wrapping_add(ll0_price)
4423        .wrapping_add(profile.match_price_from_parts(off_price, 0, stats));
4424    let mut ml = start;
4425    while ml + W <= max + 1 {
4426        let vectorised = if ml + W <= ml_cache.len() {
4427            deint(&ml_cache[ml..ml + W], ml_stamp)
4428        } else {
4429            None
4430        };
4431        if let Some(prices) = vectorised {
4432            for (k, slot) in buf.iter_mut().enumerate() {
4433                *slot = c_base.wrapping_add(prices[k]);
4434            }
4435        } else {
4436            for (k, slot) in buf.iter_mut().enumerate() {
4437                *slot = priceset_next_cost(
4438                    profile,
4439                    stats,
4440                    ml_cache,
4441                    ml_stamp,
4442                    ml + k,
4443                    ll0_price,
4444                    off_price,
4445                    base_cost,
4446                );
4447            }
4448        }
4449        let base_next = pos + ml;
4450        let mut bits = mask(&buf, &node_prices[base_next..base_next + W]);
4451        while bits != 0 {
4452            let k = bits.trailing_zeros() as usize;
4453            bits &= bits - 1;
4454            let next = base_next + k;
4455            node_prices[next] = buf[k];
4456            nodes[next] = HcOptimalNode {
4457                off,
4458                mlen: (ml + k) as u32,
4459                litlen: 0,
4460                reps,
4461            };
4462            if next > new_last {
4463                new_last = next;
4464            }
4465        }
4466        ml += W;
4467    }
4468    while ml <= max {
4469        let next_cost = priceset_next_cost(
4470            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4471        );
4472        let next = pos + ml;
4473        if next_cost < node_prices[next] {
4474            node_prices[next] = next_cost;
4475            nodes[next] = HcOptimalNode {
4476                off,
4477                mlen: ml as u32,
4478                litlen: 0,
4479                reps,
4480            };
4481            if next > new_last {
4482                new_last = next;
4483            }
4484        }
4485        ml += 1;
4486    }
4487    new_last
4488}
4489
/// AVX2 load of 8 cached ml-prices for the optimal parser's price-set, from a
/// run of 8 contiguous `[price, generation]` cells.
///
/// Yields `Some(prices)` only when ALL eight cells are warm
/// (`generation == stamp`) — the common (~91-98%) case — so the caller can
/// fold them with one broadcast constant. Any cold cell yields `None`, which
/// routes the chunk through the scalar fill (recomputing and repopulating the
/// misses). The deinterleave uses cheap in-128-lane ops (`shuffle_epi32` +
/// `unpack*_epi64`) and a single cross-lane `permute4x64` for the ordered
/// prices. That avoids the latency-bound chain of cross-lane
/// `permutevar8x32`s that lost to pipelined scalar loads on high-chunk-count
/// fixtures.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_cached_prices8_avx2(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 8]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256, _mm256_movemask_ps,
        _mm256_permute4x64_epi64, _mm256_set1_epi32, _mm256_shuffle_epi32, _mm256_storeu_si256,
        _mm256_unpackhi_epi64, _mm256_unpacklo_epi64,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256, _mm256_movemask_ps,
        _mm256_permute4x64_epi64, _mm256_set1_epi32, _mm256_shuffle_epi32, _mm256_storeu_si256,
        _mm256_unpackhi_epi64, _mm256_unpacklo_epi64,
    };
    debug_assert!(cells.len() >= 8);
    let src = cells.as_ptr().cast::<__m256i>();
    // cells_lo = [p0 g0 p1 g1 | p2 g2 p3 g3]
    // cells_hi = [p4 g4 p5 g5 | p6 g6 p7 g7]
    let cells_lo = unsafe { _mm256_loadu_si256(src) };
    let cells_hi = unsafe { _mm256_loadu_si256(src.add(1)) };
    // Per 128-bit lane, gather prices then gens: [p g p g] -> [p p g g].
    let grouped_lo = _mm256_shuffle_epi32::<0xD8>(cells_lo); // [p0 p1 g0 g1 | p2 p3 g2 g3]
    let grouped_hi = _mm256_shuffle_epi32::<0xD8>(cells_hi); // [p4 p5 g4 g5 | p6 p7 g6 g7]
    // High 64 bits of each lane hold the gens; their order is irrelevant for
    // an all-equal test.
    let generations = _mm256_unpackhi_epi64(grouped_lo, grouped_hi);
    let is_warm = _mm256_cmpeq_epi32(generations, _mm256_set1_epi32(stamp as i32));
    let warm_lanes = _mm256_movemask_ps(_mm256_castsi256_ps(is_warm)) as u8;
    if warm_lanes == 0xFF {
        // Low 64 bits of each lane hold the prices, as 64-bit chunks
        // [p0p1 p4p5 | p2p3 p6p7]; swapping the two middle chunks (control
        // 0xD8) restores [p0..p7].
        let scrambled = _mm256_unpacklo_epi64(grouped_lo, grouped_hi);
        let ordered = _mm256_permute4x64_epi64::<0xD8>(scrambled);
        let mut prices = [0u32; 8];
        unsafe { _mm256_storeu_si256(prices.as_mut_ptr().cast::<__m256i>(), ordered) };
        Some(prices)
    } else {
        None
    }
}
4539
4540#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4541#[target_feature(enable = "avx2")]
4542#[inline]
4543#[allow(clippy::too_many_arguments)]
4544unsafe fn priceset_range_nonabort_avx2(
4545    node_prices: &mut [u32],
4546    nodes: &mut [HcOptimalNode],
4547    ml_cache: &mut [[u32; 2]],
4548    ml_stamp: u32,
4549    profile: HcOptimalCostProfile,
4550    stats: &HcOptState,
4551    pos: usize,
4552    start: usize,
4553    max: usize,
4554    ll0_price: u32,
4555    off_price: u32,
4556    base_cost: u32,
4557    off: u32,
4558    reps: [u32; 3],
4559    last_pos: usize,
4560) -> usize {
4561    priceset_range_vec::<8>(
4562        node_prices,
4563        nodes,
4564        ml_cache,
4565        ml_stamp,
4566        profile,
4567        stats,
4568        pos,
4569        start,
4570        max,
4571        ll0_price,
4572        off_price,
4573        base_cost,
4574        off,
4575        reps,
4576        last_pos,
4577        // SAFETY: both closures run inside this fn's avx2 target_feature umbrella.
4578        |cells, stamp| unsafe { priceset_cached_prices8_avx2(cells, stamp) },
4579        |nc, np| unsafe { priceset_improved_mask8_avx2(nc, np) },
4580    )
4581}
4582
/// NEON load of 4 cached ml-prices from 4 contiguous `[price, generation]`
/// cells.
///
/// `vld2q_u32` performs the deinterleave natively, producing one register of
/// prices and one of generations — no shuffle chain. Yields `Some(prices)`
/// only when all 4 generations equal `stamp` (the `vminvq` of the equality
/// mask is all-ones); otherwise `None`.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_cached_prices4_neon(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::aarch64::{vceqq_u32, vdupq_n_u32, vld2q_u32, vminvq_u32, vst1q_u32};
    debug_assert!(cells.len() >= 4);
    // SAFETY: caller's neon umbrella; `cells` is >= 4 pairs = 8 contiguous u32.
    let split = unsafe { vld2q_u32(cells.as_ptr().cast::<u32>()) };
    // `split.0` holds the prices, `split.1` the generations.
    let is_warm = vceqq_u32(split.1, vdupq_n_u32(stamp));
    if vminvq_u32(is_warm) == u32::MAX {
        let mut prices = [0u32; 4];
        unsafe { vst1q_u32(prices.as_mut_ptr(), split.0) };
        Some(prices)
    } else {
        None
    }
}
4603
/// NEON 4-lane `next_cost < node_price` bitmask. NEON has an unsigned compare
/// (`vcltq_u32`) but no movemask; AND the all-ones lane mask with lane weights
/// `[1,2,4,8]` and horizontal-add (`vaddvq_u32`) to pack the 4 bits.
/// Bit `k` of the result is set when lane `k` improves.
///
/// # Safety
///
/// The executing CPU must support NEON, and `node_price` must hold at least
/// 4 elements: its first 4 values are read with one 128-bit load.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_improved_mask4_neon(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::aarch64::{vaddvq_u32, vandq_u32, vcltq_u32, vld1q_u32};
    // `node_price` is an unsized slice: guard the 4-wide load the same way
    // the cached-price kernels guard theirs.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: neon umbrella; both spans are 4 u32 wide.
    let nc = unsafe { vld1q_u32(next_cost.as_ptr()) };
    let np = unsafe { vld1q_u32(node_price.as_ptr()) };
    let lt = vcltq_u32(nc, np);
    let weights: [u32; 4] = [1, 2, 4, 8];
    let w = unsafe { vld1q_u32(weights.as_ptr()) };
    // Each improving lane contributes exactly its weight; the sum is the mask.
    let bits = vandq_u32(lt, w);
    vaddvq_u32(bits) as u8
}
4622
/// NEON tier of the non-abort price-set: forwards all arguments to
/// `priceset_range_vec::<4>`, plugging in the 4-lane NEON deinterleave and
/// improve-mask helpers.
///
/// # Safety
///
/// Must only be called when the CPU supports NEON.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_neon(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // SAFETY: both closures run inside this fn's neon target_feature umbrella.
    let load_warm_prices =
        |cells: &[[u32; 2]], stamp: u32| unsafe { priceset_cached_prices4_neon(cells, stamp) };
    let improving_lanes =
        |nc: &[u32; 4], np: &[u32]| unsafe { priceset_improved_mask4_neon(nc, np) };
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        load_warm_prices,
        improving_lanes,
    )
}
4665
/// SSE4.1-tier load of 4 cached ml-prices from 4 contiguous
/// `[price, generation]` cells (compiled under the `sse4.2` target feature).
///
/// Two 128-bit loads fetch the pairs, `shuffle_epi32(0xD8)` gathers prices
/// then gens inside each register, and `unpacklo/hi_epi64` splits them into a
/// price vector and a generation vector. Yields `Some(prices)` only when all
/// 4 generations equal `stamp`; otherwise `None`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_cached_prices4_sse41(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128, _mm_movemask_ps,
        _mm_set1_epi32, _mm_shuffle_epi32, _mm_storeu_si128, _mm_unpackhi_epi64,
        _mm_unpacklo_epi64,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128, _mm_movemask_ps,
        _mm_set1_epi32, _mm_shuffle_epi32, _mm_storeu_si128, _mm_unpackhi_epi64,
        _mm_unpacklo_epi64,
    };
    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr().cast::<__m128i>();
    let cells_lo = unsafe { _mm_loadu_si128(src) }; // [p0 g0 p1 g1]
    let cells_hi = unsafe { _mm_loadu_si128(src.add(1)) }; // [p2 g2 p3 g3]
    let grouped_lo = _mm_shuffle_epi32::<0xD8>(cells_lo); // [p0 p1 g0 g1]
    let grouped_hi = _mm_shuffle_epi32::<0xD8>(cells_hi); // [p2 p3 g2 g3]
    let generations = _mm_unpackhi_epi64(grouped_lo, grouped_hi); // [g0 g1 g2 g3]
    let is_warm = _mm_cmpeq_epi32(generations, _mm_set1_epi32(stamp as i32));
    // One sign bit per lane; all four must be set for a warm chunk.
    let warm_lanes = _mm_movemask_ps(_mm_castsi128_ps(is_warm)) as u8 & 0x0F;
    if warm_lanes == 0x0F {
        let ordered = _mm_unpacklo_epi64(grouped_lo, grouped_hi); // [p0 p1 p2 p3]
        let mut prices = [0u32; 4];
        unsafe { _mm_storeu_si128(prices.as_mut_ptr().cast::<__m128i>(), ordered) };
        Some(prices)
    } else {
        None
    }
}
4702
/// SSE4.1 4-lane `next_cost < node_price` bitmask (unsigned compare via
/// `min_epu32`, like the AVX2 path).
///
/// SSE has no unsigned 32-bit less-than, so the result is assembled as
/// `min_epu32(nc, np) == nc` (i.e. `nc <= np`) with the lanes where
/// `nc == np` masked out. Bit `i` of the returned mask is set when
/// `next_cost[i] < node_price[i]`; the upper four bits are always zero. Only
/// the first four entries of `node_price` are read.
///
/// Despite the `sse41` suffix, the function is compiled under the `sse4.2`
/// target feature.
///
/// # Safety
///
/// The running CPU must support SSE4.2, and `node_price` must contain at
/// least four entries: 16 bytes are read from its start without a bounds
/// check (the length is only verified by a `debug_assert!`).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_improved_mask4_sse41(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    // Same precondition style as the cached-price loader: the vector load
    // below is unchecked, so catch short slices in debug builds.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: `next_cost` is a `[u32; 4]` (16 readable bytes); the caller
    // guarantees `node_price` has at least four entries. Both loads are the
    // unaligned variant.
    let nc = unsafe { _mm_loadu_si128(next_cost.as_ptr() as *const __m128i) };
    let np = unsafe { _mm_loadu_si128(node_price.as_ptr() as *const __m128i) };
    let min = _mm_min_epu32(nc, np);
    // `min == nc` holds exactly on lanes where `nc <= np` (unsigned).
    let le = _mm_cmpeq_epi32(min, nc);
    let eq = _mm_cmpeq_epi32(nc, np);
    // Strictly-less = less-or-equal AND NOT equal.
    let lt = _mm_andnot_si128(eq, le);
    // Pack the sign bit of each 32-bit lane into the low four bits.
    (_mm_movemask_ps(_mm_castsi128_ps(lt)) as u8) & 0x0F
}
4727
4728#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4729#[target_feature(enable = "sse4.2")]
4730#[inline]
4731#[allow(clippy::too_many_arguments)]
4732unsafe fn priceset_range_nonabort_sse41(
4733    node_prices: &mut [u32],
4734    nodes: &mut [HcOptimalNode],
4735    ml_cache: &mut [[u32; 2]],
4736    ml_stamp: u32,
4737    profile: HcOptimalCostProfile,
4738    stats: &HcOptState,
4739    pos: usize,
4740    start: usize,
4741    max: usize,
4742    ll0_price: u32,
4743    off_price: u32,
4744    base_cost: u32,
4745    off: u32,
4746    reps: [u32; 3],
4747    last_pos: usize,
4748) -> usize {
4749    priceset_range_vec::<4>(
4750        node_prices,
4751        nodes,
4752        ml_cache,
4753        ml_stamp,
4754        profile,
4755        stats,
4756        pos,
4757        start,
4758        max,
4759        ll0_price,
4760        off_price,
4761        base_cost,
4762        off,
4763        reps,
4764        last_pos,
4765        // SAFETY: both closures run inside this fn's sse4.2 target_feature umbrella.
4766        |cells, stamp| unsafe { priceset_cached_prices4_sse41(cells, stamp) },
4767        |nc, np| unsafe { priceset_improved_mask4_sse41(nc, np) },
4768    )
4769}
4770
/// wasm `simd128` loader for four consecutive `[price, generation]` cache
/// cells: yields the four prices, but only when every generation equals
/// `stamp`.
///
/// Two 128-bit loads fetch the four pairs; `u32x4_shuffle` then picks the odd
/// lanes (generations) and the even lanes (prices) across both vectors.
/// Returns `None` if `u32x4_all_true` of the generation-equality vector is
/// false, i.e. any generation differs from `stamp`.
///
/// # Safety
///
/// `cells` must contain at least four entries: 32 bytes are read from its
/// start without a bounds check (the length is only verified by a
/// `debug_assert!`).
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_cached_prices4_simd128(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::wasm32::{
        u32x4_all_true, u32x4_eq, u32x4_shuffle, u32x4_splat, v128, v128_load, v128_store,
    };

    debug_assert!(cells.len() >= 4);
    let pairs = cells.as_ptr() as *const v128;
    // SAFETY: the caller guarantees four cells (32 bytes) are readable.
    let lo_pairs = unsafe { v128_load(pairs) }; // [p0 g0 p1 g1]
    let hi_pairs = unsafe { v128_load(pairs.add(1)) }; // [p2 g2 p3 g3]

    // Shuffle indices 0..3 address `lo_pairs`, 4..7 address `hi_pairs`.
    let generations = u32x4_shuffle::<1, 3, 5, 7>(lo_pairs, hi_pairs); // [g0 g1 g2 g3]
    let all_fresh = u32x4_all_true(u32x4_eq(generations, u32x4_splat(stamp)));

    if all_fresh {
        let prices = u32x4_shuffle::<0, 2, 4, 6>(lo_pairs, hi_pairs); // [p0 p1 p2 p3]
        let mut out = [0u32; 4];
        // SAFETY: `out` is exactly 16 bytes.
        unsafe { v128_store(out.as_mut_ptr() as *mut v128, prices) };
        Some(out)
    } else {
        None
    }
}
4797
/// wasm `simd128` 4-lane `next_cost < node_price` bitmask. wasm has a native
/// unsigned compare (`u32x4_lt`) and `u32x4_bitmask` to pack the lanes.
///
/// Bit `i` of the returned mask is set when `next_cost[i] < node_price[i]`.
/// Only the first four entries of `node_price` are read.
///
/// # Safety
///
/// `node_price` must contain at least four entries: 16 bytes are read from
/// its start without a bounds check (the length is only verified by a
/// `debug_assert!`).
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_improved_mask4_simd128(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::wasm32::{u32x4_bitmask, u32x4_lt, v128, v128_load};
    // Same precondition style as the cached-price loader: the vector load
    // below is unchecked, so catch short slices in debug builds.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: `next_cost` is a `[u32; 4]` (16 readable bytes); the caller
    // guarantees `node_price` has at least four entries.
    let nc = unsafe { v128_load(next_cost.as_ptr() as *const v128) };
    let np = unsafe { v128_load(node_price.as_ptr() as *const v128) };
    u32x4_bitmask(u32x4_lt(nc, np))
}
4809
/// wasm `simd128` instantiation of the shared 4-lane price-set range driver.
///
/// Every argument is handed to `priceset_range_vec::<4>` unchanged, together
/// with the two `simd128` lane kernels (`priceset_cached_prices4_simd128` and
/// `priceset_improved_mask4_simd128`) wrapped as closures. The driver's
/// return value is returned as-is.
///
/// # Safety
///
/// All slices and indices are forwarded untouched, so any precondition
/// `priceset_range_vec` places on them applies to the caller of this function
/// as well.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_simd128(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        // SAFETY: the closure is created inside this fn, so it executes under
        // the same `simd128` target_feature umbrella as the kernel it calls.
        |cache_cells, generation| unsafe {
            priceset_cached_prices4_simd128(cache_cells, generation)
        },
        // SAFETY: same `simd128` umbrella as above.
        |candidate_costs, current_prices| unsafe {
            priceset_improved_mask4_simd128(candidate_costs, current_prices)
        },
    )
}
4852
4853macro_rules! build_optimal_plan_impl_body {
4854    (
4855        $self:expr,
4856        $strategy_ty:ty,
4857        $current:ident,
4858        $current_abs_start:ident,
4859        $current_len:ident,
4860        $initial_state:ident,
4861        $stats:ident,
4862        $out:ident,
4863        $collect:ident,
4864        $priceset:path $(,)?
4865    ) => {{
4866        let current_abs_end = $current_abs_start + $current_len;
4867        let min_match_len = HC_OPT_MIN_MATCH_LEN;
4868        // `HC_OPT_NUM > 0` by const definition, so `HC_OPT_NUM - 1` is safe.
4869        let frontier_limit = $current_len.min(HC_OPT_NUM - 1);
4870        let initial_reps = $initial_state.reps;
4871        let initial_litlen = $initial_state.litlen;
4872        let ldm_block_offset = $initial_state.block_offset;
4873        let mut profile = $initial_state.profile;
4874        profile.sufficient_match_len = $self.hc.sufficient_match_len_for_pass(profile);
4875        // Const-fold from the strategy's associated `OPT_LEVEL`
4876        // (upstream zstd `optLevel`): BtOpt = 0, BtUltra / BtUltra2 = 2.
4877        // The two flags below are the only places the inner DP loop
4878        // used to consult `parse_mode`; lifting them into const
4879        // expressions drops one indirect read + one branch on every
4880        // candidate insertion and every traceback step.
4881        // `let` (not `const`) — nested `const` items inside a
4882        // generic fn cannot project through the outer fn's type
4883        // parameter, but a `let` binding from a const expression
4884        // does get folded by the optimiser per monomorphisation,
4885        // which is what we actually want here.
4886        debug_assert!(
4887            <$strategy_ty as super::strategy::Strategy>::USE_BT,
4888            "build_optimal_plan_impl_body called on non-BT strategy"
4889        );
4890        let abort_on_worse_match: bool =
4891            <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL == 0;
4892        let opt_level: bool = <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL >= 2;
4893        let mut nodes = core::mem::take(&mut $self.backend.bt_mut().opt_nodes_scratch);
4894        let mut node_prices = core::mem::take(&mut $self.backend.bt_mut().opt_node_prices_scratch);
4895        // `frontier_limit + 2 <= HC_OPT_NODE_LEN` — bounded by const.
4896        let frontier_buffer_size = frontier_limit + 2;
4897        if nodes.len() < HC_OPT_NODE_LEN {
4898            // First optimal-parse use (empty boxed slice) or an undersized
4899            // buffer: allocate the fixed upstream-zstd-sized frontier once. The DP
4900            // overwrites the active prefix before reading it.
4901            nodes = alloc::vec![HcOptimalNode::default(); HC_OPT_NODE_LEN].into_boxed_slice();
4902        }
4903        // The DP price array, same fixed length as `nodes`. This is the SOLE
4904        // home of each position's price (the node struct carries no price), so
4905        // the SIMD price-set vector-loads it directly. Initialised to u32::MAX
4906        // so unwritten frontier cells compare as "unreachable".
4907        if node_prices.len() < HC_OPT_NODE_LEN {
4908            node_prices = alloc::vec![u32::MAX; HC_OPT_NODE_LEN].into_boxed_slice();
4909        }
4910        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);
4911        candidates.clear();
4912        if candidates.capacity() < MAX_HC_SEARCH_DEPTH {
4913            candidates.reserve_exact(MAX_HC_SEARCH_DEPTH - candidates.capacity());
4914        }
4915        let mut store = core::mem::take(&mut $self.backend.bt_mut().opt_store_scratch);
4916        store.clear();
4917        let mut price_arena = core::mem::take(&mut $self.backend.bt_mut().opt_price_arena);
4918        if price_arena.len() < HC_OPT_PRICE_ARENA_LEN {
4919            price_arena = alloc::vec![[0u32; 2]; HC_OPT_PRICE_ARENA_LEN].into_boxed_slice();
4920        }
4921        // Single arena → two disjoint fixed-stride regions of `[price,
4922        // generation]` pairs (LL cache, ML cache): one base pointer + fixed
4923        // offsets, mirroring upstream zstd's single opt workspace. Pairing
4924        // price+generation per code keeps the optimal parser's cache probe
4925        // on ONE line instead of two strided regions.
        // SAFETY: `price_arena` is at least `HC_OPT_PRICE_ARENA_LEN =
4927        // 2 * HC_OPT_PRICE_STRIDE` pairs long (just ensured), so the two
4928        // STRIDE-wide regions are in bounds and disjoint. The slices alias
4929        // the heap buffer `price_arena` owns; that heap address is stable
4930        // across the later move of the `price_arena` box into the result
4931        // bundle (a `Box` move relocates only the pointer, not the heap
4932        // data), and the slices are never used after the bundle is
4933        // constructed. The fixed STRIDE (independent of `frontier_limit`)
4934        // keeps every code's cell at a constant offset so the monotonic
4935        // stamps stay valid across calls with different frontiers.
4936        let arena_base = price_arena.as_mut_ptr();
4937        let mut ll_cache: &mut [[u32; 2]] =
4938            unsafe { core::slice::from_raw_parts_mut(arena_base, HC_OPT_PRICE_STRIDE) };
4939        let mut ml_cache: &mut [[u32; 2]] = unsafe {
4940            core::slice::from_raw_parts_mut(arena_base.add(HC_OPT_PRICE_STRIDE), HC_OPT_PRICE_STRIDE)
4941        };
4942        $self.backend.bt_mut().opt_ll_price_stamp = $self
4943            .backend
4944            .bt_mut()
4945            .opt_ll_price_stamp
4946            .wrapping_add(1)
4947            .max(1);
4948        let ll_price_stamp = $self.backend.bt_mut().opt_ll_price_stamp;
4949        $self.backend.bt_mut().opt_lit_price_stamp = $self
4950            .backend
4951            .bt_mut()
4952            .opt_lit_price_stamp
4953            .wrapping_add(1)
4954            .max(1);
4955        let lit_price_stamp = $self.backend.bt_mut().opt_lit_price_stamp;
4956        $self.backend.bt_mut().opt_ml_price_stamp = $self
4957            .backend
4958            .bt_mut()
4959            .opt_ml_price_stamp
4960            .wrapping_add(1)
4961            .max(1);
4962        let ml_price_stamp = $self.backend.bt_mut().opt_ml_price_stamp;
4963        let node0_price = BtMatcher::cached_lit_length_price(
4964            profile,
4965            $stats,
4966            initial_litlen,
4967            &mut ll_cache,
4968            ll_price_stamp,
4969        );
4970        nodes[0] = HcOptimalNode {
4971            litlen: initial_litlen as u32,
4972            reps: initial_reps,
4973            ..HcOptimalNode::default()
4974        };
4975        node_prices[0] = node0_price;
4976        let sufficient_len = profile.sufficient_match_len;
4977        let ll0_price = BtMatcher::cached_lit_length_price(
4978            profile,
4979            $stats,
4980            0,
4981            &mut ll_cache,
4982            ll_price_stamp,
4983        );
4984        let ll1_price = BtMatcher::cached_lit_length_price(
4985            profile,
4986            $stats,
4987            1,
4988            &mut ll_cache,
4989            ll_price_stamp,
4990        );
4991        let mut pos = 1usize;
4992        let mut last_pos = 0usize;
4993        let mut forced_end: Option<usize> = None;
4994        let mut forced_end_state: Option<HcOptimalNode> = None;
4995        // Price companion of `forced_end_state` (price no longer lives in the
4996        // node struct; tracked alongside the forced-seed node).
4997        let mut forced_end_price: Option<u32> = None;
4998        let mut seed_forced_shortest_path = false;
4999        let mut opt_ldm = HcOptLdmState {
5000            seq_store: HcRawSeqStore {
5001                pos: 0,
5002                pos_in_sequence: 0,
5003                size: $self.backend.bt_mut().ldm_sequences.len(),
5004            },
5005            ..HcOptLdmState::default()
5006        };
5007        let has_ldm = !$self.backend.bt_mut().ldm_sequences.is_empty();
5008        if has_ldm {
5009            // `ldm_sequences` are emitted in BLOCK-relative coordinates,
5010            // but this optimal-parser pass runs over a SEGMENT of the
            // block starting at block-offset `ldm_block_offset` and uses
5012            // segment-relative positions throughout. Fast-forward the raw
5013            // seq-store cursor past the bytes covered by earlier segments
5014            // so the (segment-relative) LDM windows below land at the
5015            // correct positions. Idempotent: `ldm_skip_raw_seq_store_bytes`
5016            // recomputes from `pos = 0`, so re-running it per segment is
5017            // safe. Without this, every segment after the first re-applied
5018            // the block's leading LDM windows at the wrong offset, emitting
5019            // matches that copy the wrong bytes (undecodable frame).
5020            if ldm_block_offset > 0 {
5021                $self
5022                    .backend
5023                    .bt_mut()
5024                    .ldm_skip_raw_seq_store_bytes(&mut opt_ldm.seq_store, ldm_block_offset);
5025            }
5026            $self
5027                .backend
5028                .bt_mut()
5029                .ldm_get_next_match_and_update_seq_store(&mut opt_ldm, 0, $current_len);
5030        }
5031
5032        // Upstream zstd-like seed at rPos=0: initialize frontier with matches starting
5033        // at current position before entering the generic forward DP loop.
5034        if $current_len >= min_match_len {
5035            let seed_ldm = if has_ldm {
5036                $self.backend.bt_mut().ldm_process_match_candidate(
5037                    &mut opt_ldm,
5038                    0,
5039                    $current_len,
5040                    min_match_len,
5041                )
5042            } else {
5043                None
5044            };
5045            candidates.clear();
5046            // SAFETY: wrapper is in the same target_feature umbrella as the
5047            // `$collect` kernel variant; the runtime kernel detector already
5048            // gated entry into the wrapper.
5049            unsafe {
5050                $self.$collect::<$strategy_ty, true>(
5051                    $current_abs_start,
5052                    current_abs_end,
5053                    profile,
5054                    HcCandidateQuery {
5055                        reps: initial_reps,
5056                        lit_len: initial_litlen,
5057                        ldm_candidate: seed_ldm,
5058                    },
5059                    &mut candidates,
5060                )
5061            };
5062            if !candidates.is_empty() {
5063                // `min_match_len >= HC_FORMAT_MINMATCH (3)` by invariant.
5064                last_pos = (min_match_len - 1).min(frontier_limit);
5065                for p in 1..min_match_len.min(frontier_buffer_size) {
5066                    BtMatcher::reset_opt_node(&mut nodes[p]);
5067                    // Reset the price (sole home; the node carries none).
5068                    node_prices[p] = u32::MAX;
5069                    // `initial_litlen` is the litlen carried from prior
5070                    // optimal-plan segments — its real bound is the
5071                    // current block length (the frame compressor caps
5072                    // block scan at `HC_BLOCKSIZE_MAX`), not the segment
5073                    // `current_len`. `p < min_match_len` (small constant),
5074                    // so the sum stays well within `u32::MAX`. Use
5075                    // `checked_add` FIRST so the `usize` addition itself
5076                    // cannot overflow on i686 (where `usize` is 32-bit
5077                    // and a wrapping `+` would slip past `try_from`).
5078                    let seed_litlen = initial_litlen
5079                        .checked_add(p)
5080                        .and_then(|s| u32::try_from(s).ok())
5081                        .expect("optimal parser seed litlen out of u32 range");
5082                    nodes[p].litlen = seed_litlen;
5083                }
5084            }
5085
5086            if let Some(candidate) = candidates.last() {
5087                let longest_len = candidate.match_len.min($current_len);
5088                if longest_len > sufficient_len {
5089                    let off_base = BtMatcher::encode_offset_base_with_reps(
5090                        candidate.offset as u32,
5091                        initial_litlen,
5092                        initial_reps,
5093                    );
5094                    let off_price = profile
5095                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5096                    let ml_price = BtMatcher::cached_match_length_price(
5097                        profile,
5098                        $stats,
5099                        longest_len,
5100                        &mut ml_cache,
5101                        ml_price_stamp,
5102                    );
5103                    let seq_cost = BtMatcher::add_prices(
5104                        ll0_price,
5105                        profile.match_price_from_parts(off_price, ml_price, $stats),
5106                    );
5107                    let forced_price = BtMatcher::add_prices(node_prices[0], seq_cost);
5108                    let forced_state = HcOptimalNode {
5109                        off: candidate.offset as u32,
5110                        mlen: longest_len as u32,
5111                        litlen: 0,
5112                        reps: initial_reps,
5113                    };
5114                    if longest_len < frontier_buffer_size && forced_price < node_prices[longest_len] {
5115                        nodes[longest_len] = forced_state;
5116                        node_prices[longest_len] = forced_price;
5117                    }
5118                    forced_end = Some(longest_len);
5119                    forced_end_state = Some(forced_state);
5120                    forced_end_price = Some(forced_price);
5121                    seed_forced_shortest_path = true;
5122                }
5123            }
5124            if !seed_forced_shortest_path {
5125                let mut prev_max_len = min_match_len - 1;
5126                for candidate in candidates.iter() {
5127                    let max_match_len = candidate.match_len.min(frontier_limit);
5128                    if max_match_len < min_match_len {
5129                        continue;
5130                    }
5131                    let start_len = (prev_max_len + 1).max(min_match_len);
5132                    if start_len > max_match_len {
5133                        prev_max_len = prev_max_len.max(max_match_len);
5134                        continue;
5135                    }
5136                    if max_match_len > last_pos {
5137                        BtMatcher::reset_opt_nodes(
5138                            &mut nodes,
5139                            &mut node_prices,
5140                            last_pos + 1,
5141                            max_match_len,
5142                        );
5143                    }
5144                    let off_base = BtMatcher::encode_offset_base_with_reps(
5145                        candidate.offset as u32,
5146                        initial_litlen,
5147                        initial_reps,
5148                    );
5149                    let off_price = profile
5150                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5151                    debug_assert!(max_match_len < frontier_buffer_size);
5152                    let nodes0_price = node_prices[0];
5153                    for match_len in (start_len..=max_match_len).rev() {
5154                        let ml_price = BtMatcher::cached_match_length_price(
5155                            profile,
5156                            $stats,
5157                            match_len,
5158                            &mut ml_cache,
5159                            ml_price_stamp,
5160                        );
5161                        let seq_cost = BtMatcher::add_prices(
5162                            ll0_price,
5163                            profile.match_price_from_parts(off_price, ml_price, $stats),
5164                        );
5165                        let next_cost = BtMatcher::add_prices(nodes0_price, seq_cost);
5166                        let node_price = unsafe { *node_prices.get_unchecked(match_len) };
5167                        if match_len > last_pos || next_cost < node_price {
5168                            let slot = unsafe { nodes.get_unchecked_mut(match_len) };
5169                            *slot = HcOptimalNode {
5170                                off: candidate.offset as u32,
5171                                mlen: match_len as u32,
5172                                litlen: 0,
5173                                reps: initial_reps,
5174                            };
5175                            unsafe { *node_prices.get_unchecked_mut(match_len) = next_cost };
5176                            if match_len > last_pos {
5177                                last_pos = match_len;
5178                            }
5179                        } else if abort_on_worse_match {
5180                            break;
5181                        }
5182                    }
5183                    prev_max_len = prev_max_len.max(max_match_len);
5184                }
5185                if last_pos + 1 < frontier_buffer_size {
5186                    node_prices[last_pos + 1] = u32::MAX;
5187                }
5188            }
5189        }
5190        while !seed_forced_shortest_path && pos <= last_pos && pos <= frontier_limit {
5191            debug_assert!(pos + 1 < frontier_buffer_size);
5192            let prev_node = unsafe { *nodes.get_unchecked(pos - 1) };
5193            let prev_node_price = unsafe { *node_prices.get_unchecked(pos - 1) };
5194            if prev_node_price != u32::MAX {
5195                let lit_len = prev_node.litlen as usize + 1;
5196                let lit_price = {
5197                    let bt = $self.backend.bt_mut();
5198                    BtMatcher::cached_literal_price(
5199                        profile,
5200                        $stats,
5201                        $current[pos - 1],
5202                        &mut bt.opt_lit_price_scratch,
5203                        &mut bt.opt_lit_price_generation,
5204                        lit_price_stamp,
5205                    )
5206                };
5207                let ll_delta = BtMatcher::cached_lit_length_delta_price(
5208                    profile,
5209                    $stats,
5210                    lit_len,
5211                    &mut ll_cache,
5212                    ll_price_stamp,
5213                );
5214                let lit_cost = BtMatcher::add_price_delta(prev_node_price, lit_price, ll_delta);
5215                // `node_pos_price` is the OLD price at `pos` (before the write
5216                // below) — also the price of `prev_match`, the pre-overwrite copy.
5217                let node_pos_price = unsafe { *node_prices.get_unchecked(pos) };
5218                if lit_cost <= node_pos_price {
5219                    let prev_match = unsafe { *nodes.get_unchecked(pos) };
5220                    let slot = unsafe { nodes.get_unchecked_mut(pos) };
5221                    *slot = prev_node;
5222                    slot.litlen = lit_len as u32;
5223                    node_prices[pos] = lit_cost;
5224                    #[allow(clippy::collapsible_if)]
5225                    if opt_level
5226                        && prev_match.mlen > 0
5227                        && prev_match.litlen == 0
5228                        && pos < $current_len
5229                    {
5230                        if ll1_price < ll0_price {
5231                            let next_lit_price = {
5232                                let bt = $self.backend.bt_mut();
5233                                BtMatcher::cached_literal_price(
5234                                    profile,
5235                                    $stats,
5236                                    $current[pos],
5237                                    &mut bt.opt_lit_price_scratch,
5238                                    &mut bt.opt_lit_price_generation,
5239                                    lit_price_stamp,
5240                                )
5241                            };
5242                            let with1literal = BtMatcher::add_price_delta(
5243                                node_pos_price,
5244                                next_lit_price,
5245                                ll1_price as i32 - ll0_price as i32,
5246                            );
5247                            let ll_delta_next = BtMatcher::cached_lit_length_delta_price(
5248                                profile,
5249                                $stats,
5250                                lit_len + 1,
5251                                &mut ll_cache,
5252                                ll_price_stamp,
5253                            );
5254                            let with_more_literals =
5255                                BtMatcher::add_price_delta(lit_cost, next_lit_price, ll_delta_next);
5256                            let next = pos + 1;
5257                            let next_price = unsafe { *node_prices.get_unchecked(next) };
5258                            if with1literal < with_more_literals && with1literal < next_price {
5259                                // Upstream zstd parity (zstd_opt.c:1232): `cur >= prevMatch.mlen`.
5260                                debug_assert!(pos >= prev_match.mlen as usize);
5261                                let prev_pos = pos - prev_match.mlen as usize;
5262                                {
5263                                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5264                                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5265                                        prev_match.off,
5266                                        prev_state.litlen as usize,
5267                                        prev_state.reps,
5268                                    );
5269                                    let slot = unsafe { nodes.get_unchecked_mut(next) };
5270                                    *slot = prev_match;
5271                                    slot.reps = reps_after_match;
5272                                    slot.litlen = 1;
5273                                    node_prices[next] = with1literal;
5274                                    if next > last_pos {
5275                                        last_pos = next;
5276                                    }
5277                                }
5278                            }
5279                        }
5280                    }
5281                }
5282            }
5283
5284            // Memory-resident DP (upstream zstd parity): read opt[cur] fields on
5285            // demand instead of holding a 28-byte node copy live across the
5286            // per-position `$collect` call below. The held copy forced LLVM
5287            // to spill reps[3] + litlen around the (non-inlinable) call;
5288            // reading the fields fresh on each side keeps them out of the
5289            // cross-call live set. `nodes[pos]` is stable across `$collect`
5290            // (it only fills `candidates`), so post-call reads are identical.
5291            let base_cost = unsafe { *node_prices.get_unchecked(pos) };
5292            if base_cost == u32::MAX {
5293                pos += 1;
5294                continue;
5295            }
5296            {
5297                let base_node = unsafe { *nodes.get_unchecked(pos) };
5298                if base_node.mlen > 0 && base_node.litlen == 0 {
5299                    // Upstream zstd parity (zstd_opt.c:1255): `cur >= opt[cur].mlen`.
5300                    debug_assert!(pos >= base_node.mlen as usize);
5301                    let prev_pos = pos - base_node.mlen as usize;
5302                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5303                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5304                        base_node.off,
5305                        prev_state.litlen as usize,
5306                        prev_state.reps,
5307                    );
5308                    unsafe { nodes.get_unchecked_mut(pos).reps = reps_after_match };
5309                }
5310            }
5311
5312            if pos + 8 > $current_len {
5313                pos += 1;
5314                continue;
5315            }
5316
5317            if pos == last_pos {
5318                break;
5319            }
5320
5321            let next_price = unsafe { *node_prices.get_unchecked(pos + 1) };
5322            // `saturating_add` is REQUIRED here, not a masked bug: `base_cost`
5323            // is a node price that can be the `u32::MAX` "unreachable" sentinel,
5324            // and saturating keeps `base_cost + margin` pinned at MAX so the
5325            // comparison stays correct. Plain `+` would wrap the sentinel and
5326            // flip the abort decision (a ratio bug / debug overflow panic).
5327            if abort_on_worse_match
5328                && next_price <= base_cost.saturating_add(HC_BITCOST_MULTIPLIER / 2)
5329            {
5330                pos += 1;
5331                continue;
5332            }
5333
5334            let abs_pos = $current_abs_start + pos;
5335            let ldm_candidate = if has_ldm {
5336                $self.backend.bt_mut().ldm_process_match_candidate(
5337                    &mut opt_ldm,
5338                    pos,
5339                    $current_len - pos,
5340                    min_match_len,
5341                )
5342            } else {
5343                None
5344            };
5345            candidates.clear();
5346            // SAFETY: same umbrella as `$collect`. Query fields are read
5347            // fresh here (consumed into the call's argument) so they do not
5348            // stay live across the call; the post-call reads below are a
5349            // separate, fresh load of the same stable `nodes[pos]`.
5350            unsafe {
5351                $self.$collect::<$strategy_ty, true>(
5352                    abs_pos,
5353                    current_abs_end,
5354                    profile,
5355                    HcCandidateQuery {
5356                        reps: nodes.get_unchecked(pos).reps,
5357                        lit_len: nodes.get_unchecked(pos).litlen as usize,
5358                        ldm_candidate,
5359                    },
5360                    &mut candidates,
5361                )
5362            };
5363            // Post-call reads of opt[cur]: fresh, born after `$collect`, so
5364            // never part of the cross-call live set (see memory-resident note
5365            // above). `nodes[pos]` is untouched by `$collect`.
5366            let base_reps = unsafe { nodes.get_unchecked(pos).reps };
5367            let base_litlen = unsafe { nodes.get_unchecked(pos).litlen as usize };
5368            if let Some(candidate) = candidates.last() {
5369                let longest_len = candidate.match_len.min($current_len - pos);
5370                if longest_len > sufficient_len
5371                    || pos + longest_len >= HC_OPT_NUM
5372                    || pos + longest_len >= $current_len
5373                {
5374                    let lit_len = base_litlen;
5375                    let off_base = BtMatcher::encode_offset_base_with_reps(
5376                        candidate.offset as u32,
5377                        lit_len,
5378                        base_reps,
5379                    );
5380                    let off_price = profile
5381                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5382                    let ml_price = BtMatcher::cached_match_length_price(
5383                        profile,
5384                        $stats,
5385                        longest_len,
5386                        &mut ml_cache,
5387                        ml_price_stamp,
5388                    );
5389                    let seq_cost = BtMatcher::add_prices(
5390                        ll0_price,
5391                        profile.match_price_from_parts(off_price, ml_price, $stats),
5392                    );
5393                    let forced_price = BtMatcher::add_prices(base_cost, seq_cost);
5394                    let end_pos = (pos + longest_len).min($current_len);
5395                    forced_end = Some(end_pos);
5396                    forced_end_state = Some(HcOptimalNode {
5397                        off: candidate.offset as u32,
5398                        mlen: longest_len as u32,
5399                        litlen: 0,
5400                        reps: base_reps,
5401                    });
5402                    forced_end_price = Some(forced_price);
5403                    break;
5404                }
5405            }
5406            let mut prev_max_len = min_match_len - 1;
5407            for candidate in candidates.iter() {
5408                // Outer loop guards `pos <= frontier_limit` (see the
5409                // `while ... pos <= frontier_limit` condition); the
5410                // subtraction below is therefore safe.
5411                debug_assert!(pos <= frontier_limit);
5412                let max_match_len = candidate
5413                    .match_len
5414                    .min($current_len - pos)
5415                    .min(frontier_limit - pos);
5416                let min_len = min_match_len;
5417                if max_match_len < min_len {
5418                    continue;
5419                }
5420                let start_len = (prev_max_len + 1).max(min_len);
5421                if start_len > max_match_len {
5422                    prev_max_len = prev_max_len.max(max_match_len);
5423                    continue;
5424                }
5425                let max_next = pos + max_match_len;
5426                if max_next > last_pos {
5427                    BtMatcher::reset_opt_nodes(
5428                        &mut nodes,
5429                        &mut node_prices,
5430                        last_pos + 1,
5431                        max_next,
5432                    );
5433                }
5434                let lit_len = base_litlen;
5435                let off_base = BtMatcher::encode_offset_base_with_reps(
5436                    candidate.offset as u32,
5437                    lit_len,
5438                    base_reps,
5439                );
5440                let off_price = profile
5441                    .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5442                debug_assert!(pos + max_match_len < frontier_buffer_size);
5443                if abort_on_worse_match {
5444                    // btopt (OPT_LEVEL == 0): reverse-iterate with early break —
5445                    // once a longer match stops improving, shorter ones are
5446                    // skipped. Order-dependent, stays scalar.
5447                    for match_len in (start_len..=max_match_len).rev() {
5448                        let next = pos + match_len;
5449                        let ml_price = BtMatcher::cached_match_length_price(
5450                            profile,
5451                            $stats,
5452                            match_len,
5453                            &mut ml_cache,
5454                            ml_price_stamp,
5455                        );
5456                        let seq_cost = BtMatcher::add_prices(
5457                            ll0_price,
5458                            profile.match_price_from_parts(off_price, ml_price, $stats),
5459                        );
5460                        let next_cost = BtMatcher::add_prices(base_cost, seq_cost);
5461                        let node_next_price = unsafe { *node_prices.get_unchecked(next) };
5462                        if next > last_pos || next_cost < node_next_price {
5463                            let slot = unsafe { nodes.get_unchecked_mut(next) };
5464                            *slot = HcOptimalNode {
5465                                off: candidate.offset as u32,
5466                                mlen: match_len as u32,
5467                                litlen: 0,
5468                                reps: base_reps,
5469                            };
5470                            unsafe { *node_prices.get_unchecked_mut(next) = next_cost };
5471                            if next > last_pos {
5472                                last_pos = next;
5473                            }
5474                        } else {
5475                            break;
5476                        }
5477                    }
5478                } else {
5479                    // btultra / btultra2 (OPT_LEVEL >= 2): no abort, each
5480                    // match_len writes a distinct node => order-independent.
5481                    // Dispatch to the per-tier price-set ($priceset is the
5482                    // tier's fn: AVX2 SoA-vector compare for the avx2 wrapper,
5483                    // inline scalar otherwise) — it folds into this wrapper's
5484                    // monomorphisation, so no call ABI / runtime feature check.
5485                    #[allow(unused_unsafe)]
5486                    {
5487                        last_pos = last_pos.max(unsafe {
5488                            $priceset(
5489                                &mut node_prices,
5490                                &mut nodes,
5491                                ml_cache,
5492                                ml_price_stamp,
5493                                profile,
5494                                $stats,
5495                                pos,
5496                                start_len,
5497                                max_match_len,
5498                                ll0_price,
5499                                off_price,
5500                                base_cost,
5501                                candidate.offset as u32,
5502                                base_reps,
5503                                last_pos,
5504                            )
5505                        });
5506                    }
5507                }
5508                prev_max_len = prev_max_len.max(max_match_len);
5509            }
5510
5511            if last_pos + 1 < frontier_buffer_size {
5512                unsafe {
5513                    *node_prices.get_unchecked_mut(last_pos + 1) = u32::MAX;
5514                }
5515            }
5516            pos += 1;
5517        }
5518
5519        if last_pos == 0 {
5520            if $current_len == 0 {
5521                let price = node_prices[0];
5522                return $self.backend.bt_mut().finish_optimal_plan(
5523                    HcOptimalPlanBuffers {
5524                        nodes,
5525                        node_prices,
5526                        candidates,
5527                        store,
5528                        price_arena,
5529                    },
5530                    (price, initial_reps, initial_litlen, 0),
5531                );
5532            }
5533            let lit_price = {
5534                let bt = $self.backend.bt_mut();
5535                BtMatcher::cached_literal_price(
5536                    profile,
5537                    $stats,
5538                    $current[0],
5539                    &mut bt.opt_lit_price_scratch,
5540                    &mut bt.opt_lit_price_generation,
5541                    lit_price_stamp,
5542                )
5543            };
5544            // `initial_litlen` is carried across optimal-plan segments;
5545            // its real bound is the current block length, not
5546            // `current_len`. On i686 (32-bit `usize`) `+ 1` could
5547            // theoretically wrap if the invariant ever broke. Catch
5548            // that explicitly via `checked_add` rather than letting a
5549            // wrapping sum slip into the price lookup.
5550            let next_litlen = initial_litlen
5551                .checked_add(1)
5552                .expect("optimal parser next litlen out of usize range");
5553            let ll_delta = BtMatcher::cached_lit_length_delta_price(
5554                profile,
5555                $stats,
5556                next_litlen,
5557                &mut ll_cache,
5558                ll_price_stamp,
5559            );
5560            let price = BtMatcher::add_price_delta(node_prices[0], lit_price, ll_delta);
5561            return $self.backend.bt_mut().finish_optimal_plan(
5562                HcOptimalPlanBuffers {
5563                    nodes,
5564                    node_prices,
5565                    candidates,
5566                    store,
5567                    price_arena,
5568                },
5569                (price, initial_reps, next_litlen, 1),
5570            );
5571        }
5572
5573        let target_pos = forced_end.unwrap_or(last_pos.min(frontier_limit));
5574        // Price lives in `node_prices`, not the node struct, so carry the
5575        // final-stretch price alongside its node (forced-seed companion or the
5576        // frontier price at `target_pos`).
5577        let (last_stretch, last_stretch_price) = if let Some(forced_state) = forced_end_state {
5578            (forced_state, forced_end_price.expect("forced state has a price"))
5579        } else {
5580            (nodes[target_pos], node_prices[target_pos])
5581        };
5582        if last_stretch_price == u32::MAX {
5583            return $self.backend.bt_mut().finish_optimal_plan(
5584                HcOptimalPlanBuffers {
5585                    nodes,
5586                    node_prices,
5587                    candidates,
5588                    store,
5589                    price_arena,
5590                },
5591                (u32::MAX, initial_reps, initial_litlen, $current_len),
5592            );
5593        }
5594
5595        if last_stretch.mlen == 0 {
5596            return $self.backend.bt_mut().finish_optimal_plan(
5597                HcOptimalPlanBuffers {
5598                    nodes,
5599                    node_prices,
5600                    candidates,
5601                    store,
5602                    price_arena,
5603                },
5604                (
5605                    last_stretch_price,
5606                    last_stretch.reps,
5607                    last_stretch.litlen as usize,
5608                    target_pos.min($current_len),
5609                ),
5610            );
5611        }
5612
5613        let mut cur = target_pos.saturating_sub(last_stretch.mlen as usize);
5614        let end_reps = if last_stretch.litlen == 0 {
5615            let prev_state = nodes[cur];
5616            let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5617                last_stretch.off,
5618                prev_state.litlen as usize,
5619                prev_state.reps,
5620            );
5621            reps_after_match
5622        } else {
5623            let tail_literals = last_stretch.litlen as usize;
5624            if cur < tail_literals {
5625                return $self.backend.bt_mut().finish_optimal_plan(
5626                    HcOptimalPlanBuffers {
5627                        nodes,
5628                        node_prices,
5629                        candidates,
5630                        store,
5631                        price_arena,
5632                    },
5633                    (
5634                        last_stretch_price,
5635                        last_stretch.reps,
5636                        tail_literals,
5637                        target_pos.min($current_len),
5638                    ),
5639                );
5640            }
5641            cur -= tail_literals;
5642            last_stretch.reps
5643        };
5644        let store_end = cur + 2;
5645        if store.len() <= store_end {
5646            store.resize(store_end + 1, HcOptimalNode::default());
5647        }
5648        let mut store_start;
5649        let mut stretch_pos = cur;
5650
5651        if last_stretch.litlen > 0 {
5652            store[store_end] = HcOptimalNode {
5653                litlen: last_stretch.litlen,
5654                mlen: 0,
5655                ..HcOptimalNode::default()
5656            };
5657            store_start = store_end.saturating_sub(1);
5658            store[store_start] = last_stretch;
5659        }
5660        store[store_end] = last_stretch;
5661        store_start = store_end;
5662
5663        loop {
5664            let next_stretch = nodes[stretch_pos];
5665            store[store_start].litlen = next_stretch.litlen;
5666            if next_stretch.mlen == 0 {
5667                break;
5668            }
5669            if store_start == 0 {
5670                break;
5671            }
5672            store_start -= 1;
5673            store[store_start] = next_stretch;
5674            // Parser invariant: every emitted stretch is bounded by the
5675            // current block, so `litlen + mlen <= current_len <=
5676            // HC_BLOCKSIZE_MAX (128 KiB)`. The `as usize` widening + raw
5677            // `+` is safe on 32-bit targets — two u32 values do NOT
5678            // automatically fit in `usize` on i686, the block bound is
5679            // what makes this addition safe.
5680            let litlen = next_stretch.litlen as usize;
5681            let mlen = next_stretch.mlen as usize;
5682            debug_assert!(litlen + mlen <= $current_len);
5683            let step = litlen + mlen;
5684            if step == 0 || stretch_pos < step {
5685                break;
5686            }
5687            stretch_pos -= step;
5688        }
5689
5690        let mut tail_literals = initial_litlen;
5691        let mut store_pos = store_start;
5692        while store_pos <= store_end {
5693            let stretch = store[store_pos];
5694            let llen = stretch.litlen as usize;
5695            let mlen = stretch.mlen as usize;
5696            if mlen == 0 {
5697                tail_literals = llen;
5698                store_pos += 1;
5699                continue;
5700            }
5701            $out.push(HcOptimalSequence {
5702                offset: stretch.off,
5703                match_len: mlen as u32,
5704                lit_len: llen as u32,
5705            });
5706            tail_literals = 0;
5707            store_pos += 1;
5708        }
5709        let result = (
5710            last_stretch_price,
5711            end_reps,
5712            if last_stretch.litlen > 0 {
5713                last_stretch.litlen as usize
5714            } else {
5715                tail_literals
5716            },
5717            target_pos.min($current_len),
5718        );
5719        $self.backend.bt_mut().finish_optimal_plan(
5720            HcOptimalPlanBuffers {
5721                nodes,
5722                node_prices,
5723                candidates,
5724                store,
5725                price_arena,
5726            },
5727            result,
5728        )
5729    }};
5730}
5731
/// Body of `collect_optimal_candidates_initialized`, stamped out once per
/// CPU kernel.
///
/// The kernel-specific pieces arrive as macro arguments: `$cpl` is the
/// kernel's `common_prefix_len_ptr` (only the HC chain-walk fallback calls
/// it), while `$bt_update`, `$bt_insert`, `$for_each_rep` and `$hash3` name
/// the kernel-specific wrappers of the inner helpers. Keeping every helper
/// under the same `target_feature` umbrella as the expanding wrapper is what
/// lets the per-position pipeline — BT-tree fill, rep probing, hash3
/// probing, then BT match collection or the HC chain walk — inline without
/// ABI barriers on the level22 hot path.
///
/// Candidates are accumulated in `$out`, which is cleared first. The
/// expansion contains bare `return;` statements, so it has to be used as the
/// body of a function returning `()`.
macro_rules! collect_optimal_candidates_initialized_body {
    (
        $self:expr,
        $strategy_ty:ty,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $query:ident,
        $out:ident,
        $bt_matchfinder:ident,
        $bt_update:ident,
        $bt_insert:ident,
        $for_each_rep:ident,
        $hash3:ident,
        $cpl:path $(,)?
    ) => {{
        // `USE_HASH3` is an associated const of the strategy type, so every
        // monomorphisation sees a fixed value and the hash3 block further
        // down is dead code wherever it is `false`. Only one direction is
        // asserted: a strategy that declares hash3 needs a configured table.
        // A non-zero `hash3_log` without `USE_HASH3` is tolerated.
        let hash3_enabled: bool = <$strategy_ty as super::strategy::Strategy>::USE_HASH3;
        debug_assert!(!$self.table.hash_table.is_empty());
        debug_assert!($self.table.hash3_log == 0 || !$self.table.hash3_table.is_empty());
        debug_assert!(
            !hash3_enabled || $self.table.hash3_log != 0,
            "Strategy::USE_HASH3 = true but runtime hash3_log is 0 — call configure() first",
        );
        debug_assert!(!$self.table.chain_table.is_empty());

        let min_len = HC_OPT_MIN_MATCH_LEN;
        let rep_history = $query.reps;
        let pending_literals = $query.lit_len;
        let ldm_hint = $query.ldm_candidate;
        $out.clear();

        // Positions below `skip_insert_until_abs` get no table work at all;
        // the LDM candidate, when one was supplied, is the only thing
        // reported for them.
        if $abs_pos < $self.table.skip_insert_until_abs {
            if let Some(ldm) = ldm_hint {
                let mut ladder_top = 0usize;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_top,
                    ldm,
                    min_len,
                );
            }
            return;
        }

        if $bt_matchfinder {
            // SAFETY: the expanding wrapper carries the same target_feature
            // set as `$bt_update`, and entry was already gated by the runtime
            // kernel detector.
            unsafe { $self.table.$bt_update($abs_pos, $current_abs_end) };
        }

        // Fewer than 4 bytes of live history left at this position: again
        // only the LDM candidate can be reported.
        let history_idx = $abs_pos - $self.table.history_abs_start;
        let live_len = $self.table.live_history().len();
        if history_idx + 4 > live_len {
            if let Some(ldm) = ldm_hint {
                let mut ladder_top = 0usize;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_top,
                    ldm,
                    min_len,
                );
            }
            return;
        }

        let mut ladder_top = 0usize;
        let mut stop_search = false;
        let mut rep_hit = false;
        // SAFETY: same target_feature umbrella as `$for_each_rep`; the
        // closure is monomorphised per call site.
        unsafe {
            $self.hc.$for_each_rep(
                &$self.table,
                $abs_pos,
                pending_literals,
                rep_history,
                $current_abs_end,
                min_len,
                |rep| {
                    let rep_len = rep.match_len;
                    rep_hit |= rep_len >= min_len;
                    let _ = super::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        &mut ladder_top,
                        rep,
                        min_len,
                    );
                    let long_enough = rep_len > $profile.sufficient_match_len;
                    // The rep helper caps `rep.match_len` at
                    // `current_abs_end - abs_pos`, so this sum cannot exceed
                    // `current_abs_end` and stays inside `usize`.
                    let reaches_end = $abs_pos + rep_len >= $current_abs_end;
                    if long_enough || reaches_end {
                        stop_search = true;
                    }
                },
            )
        };

        // Short-match (hash3) probe: runs only for strategies that enable it,
        // and only while neither a terminating rep nor any ladder entry of at
        // least `min_len` has been found.
        if hash3_enabled && !stop_search && ladder_top < min_len {
            $self.table.update_hash3_until($abs_pos);
            // SAFETY: same target_feature umbrella as `$hash3`.
            let h3_probe =
                unsafe { $self.table.$hash3($abs_pos, $current_abs_end, min_len) };
            if let Some(h3) = h3_probe {
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut ladder_top,
                    h3,
                    min_len,
                );
                if !rep_hit
                    && (h3.match_len > $profile.sufficient_match_len
                        || $abs_pos + h3.match_len >= $current_abs_end)
                {
                    $self.table.skip_insert_until_abs = $abs_pos + 1;
                    stop_search = true;
                }
            }
        }

        if !stop_search {
            if $bt_matchfinder {
                // SAFETY: same target_feature umbrella as `$bt_insert`.
                unsafe {
                    $self.table.$bt_insert(
                        $abs_pos,
                        $current_abs_end,
                        $profile,
                        min_len,
                        &mut ladder_top,
                        $out,
                    )
                };
            } else {
                // Hash-chain fallback: register this position, then walk at
                // most `depth_budget` chain entries looking for longer
                // matches.
                $self.table.insert_position($abs_pos);
                let depth_budget = $profile.max_chain_depth.min($self.hc.search_depth);
                let live = $self.table.live_history();
                // The raw `+ 9` relies on the absolute-position headroom cap
                // discussed at `bt_insert_step_no_rebase_body!`
                // (`STREAM_ABS_HEADROOM` in `MatchTable::add_data`).
                let mut farthest_match_end = $abs_pos + 9;
                if depth_budget > 0 {
                    let history_base = live.as_ptr();
                    for (steps_taken, chain_abs) in $self
                        .hc
                        .chain_candidates(&$self.table, $abs_pos)
                        .into_iter()
                        .enumerate()
                    {
                        // Stop on an exhausted depth budget or on the
                        // `usize::MAX` end-of-chain marker.
                        if steps_taken >= depth_budget || chain_abs == usize::MAX {
                            break;
                        }
                        // Only positions inside the window and strictly
                        // before `abs_pos` are usable.
                        let window_low = $self.table.window_low_abs_for_target($abs_pos);
                        if !(window_low..$abs_pos).contains(&chain_abs) {
                            continue;
                        }
                        let chain_idx = chain_abs - $self.table.history_abs_start;
                        debug_assert!(
                            $abs_pos <= $current_abs_end,
                            "HC chain walker called past current block end"
                        );
                        let scan_cap = $current_abs_end - $abs_pos;
                        // SAFETY: both offsets are history-relative indices
                        // into `live`, and `scan_cap` bounds the scan within
                        // it. `$cpl` is the kernel-specific
                        // common_prefix_len_ptr, inlinable because the
                        // wrapper shares its target_feature set.
                        let shared_len = unsafe {
                            $cpl(
                                history_base.add(chain_idx),
                                history_base.add(history_idx),
                                scan_cap,
                            )
                        };
                        if shared_len < min_len {
                            continue;
                        }
                        let accepted = super::bt::BtMatcher::push_candidate_ladder(
                            $out,
                            &mut ladder_top,
                            MatchCandidate {
                                start: $abs_pos,
                                offset: $abs_pos - chain_abs,
                                match_len: shared_len,
                            },
                            min_len,
                        );
                        if accepted {
                            farthest_match_end = farthest_match_end.max(chain_abs + shared_len);
                        }
                        if shared_len > HC_OPT_NUM || $abs_pos + shared_len >= $current_abs_end {
                            break;
                        }
                    }
                }
                // `farthest_match_end` starts at `abs_pos + 9` and only ever
                // grows, so subtracting 8 cannot underflow.
                let resume_at = farthest_match_end - 8;
                $self.table.skip_insert_until_abs =
                    $self.table.skip_insert_until_abs.max(resume_at);
            }
        }

        // The LDM candidate is offered last, against whatever the ladder
        // already holds.
        if let Some(ldm) = ldm_hint {
            let _ = super::bt::BtMatcher::push_candidate_ladder(
                $out,
                &mut ladder_top,
                ldm,
                min_len,
            );
        }
    }};
}
5960
/// Body of `hash3_candidate`, generic over the per-CPU
/// `common_prefix_len_ptr` symbol supplied as `$cpl`.
///
/// Reads the single hash3-table slot that `hash_position_at` selects for
/// `$abs_pos`. It produces `Some(MatchCandidate)` when that slot decodes to
/// an earlier position still inside live history, closer than
/// `HC3_MAX_OFFSET`, whose common prefix with `$abs_pos` is at least
/// `$min_match_len` bytes. Every other outcome is `None`. The probe is a
/// single candidate per call, so avoiding the per-call ABI barrier adds up
/// across a segment.
///
/// The expansion uses `return None` and `?`, so it has to be the body of a
/// function returning `Option`. Crate-private (see
/// `bt_insert_step_no_rebase_body!`).
macro_rules! hash3_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $cpl:path $(,)?
    ) => {{
        // A zero `hash3_log` short-circuits the probe before any table
        // access.
        if $table.hash3_log == 0 {
            return None;
        }
        // Positions before the start of live history, or with fewer than 4
        // bytes left in it, are not probed.
        let cur_idx = $abs_pos.checked_sub($table.history_abs_start)?;
        let history = $table.live_history();
        if cur_idx + 4 > history.len() {
            return None;
        }

        let slot = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            history,
            cur_idx,
            $table.hash3_log,
            3,
        );
        // An out-of-range slot is treated exactly like an empty entry.
        let stored = $table.hash3_table.get(slot).map_or(
            $crate::encoding::match_table::storage::HC_EMPTY,
            |entry| *entry,
        );
        let prior_abs =
            $crate::encoding::match_table::storage::MatchTable::stored_abs_position_fast(
                stored,
                $table.position_base,
                $table.index_shift,
            )?;

        // The decoded position must lie inside live history and strictly
        // before the current position.
        if !($table.history_abs_start..$abs_pos).contains(&prior_abs) {
            return None;
        }
        let distance = $abs_pos - prior_abs;
        if distance >= $crate::encoding::bt::HC3_MAX_OFFSET {
            return None;
        }

        let prior_idx = prior_abs - $table.history_abs_start;
        let scan_cap = $current_abs_end.saturating_sub($abs_pos);
        let history_base = history.as_ptr();
        // SAFETY: `prior_idx` and `cur_idx` are both within the history
        // range, and `scan_cap` bounds the scan within `history`.
        let shared_len = unsafe {
            $cpl(
                history_base.add(prior_idx),
                history_base.add(cur_idx),
                scan_cap,
            )
        };
        if shared_len >= $min_match_len {
            Some($crate::encoding::opt::types::MatchCandidate {
                start: $abs_pos,
                offset: distance,
                match_len: shared_len,
            })
        } else {
            None
        }
    }};
}
6018pub(crate) use hash3_candidate_body;
6019
/// `for_each_repcode_candidate_with_reps` body parameterized over the per-CPU
/// `common_prefix_len_ptr` symbol so the per-rep prefix probe inlines under
/// the wrapper's `target_feature` umbrella instead of crossing the ABI
/// boundary through the dispatcher. Three rep probes per encoded position →
/// thousands per segment, so the per-call barrier was non-trivial.
///
/// The callback `f` runs in the wrapper's umbrella context too, so closures
/// that capture mutable state still work (FnMut). Crate-private
/// (see `bt_insert_step_no_rebase_body!`).
///
/// Arguments:
/// - `$table`: match table; the body reads `live_history()` and
///   `history_abs_start` from it.
/// - `$abs_pos`: absolute position being probed.
/// - `$lit_len`: pending literal run length. When it is `0` the probed
///   offsets are `[reps[1], reps[2], reps[0] - 1]` (the last one only when
///   `reps[0] > 1`); otherwise they are `[reps[0], reps[1], reps[2]]`.
/// - `$reps`: the three repeat offsets (indexed `0..3`, cast to `usize`).
/// - `$current_abs_end`: absolute end of the searchable region; caps the
///   reported match length.
/// - `$min_match_len`: shortest match length worth reporting.
/// - `$f`: callback invoked once per accepted candidate, in probe order.
/// - `$cpl`: prefix-length kernel, called as `$cpl(ptr_a, ptr_b, max)`.
///
/// The expansion contains bare `return;` statements, so it must be expanded
/// inside a function returning `()`. The candidate type is named through
/// `$crate`, so the expansion site does not need `MatchCandidate` in scope
/// (same convention as the sibling `*_body!` macros).
macro_rules! for_each_repcode_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $lit_len:ident,
        $reps:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $f:ident,
        $cpl:path $(,)?
    ) => {{
        // With no pending literals the repcode set is shifted by one: rep0
        // would describe the bytes just emitted, so `rep0 - 1` is probed
        // instead (and only when it is a non-zero offset).
        let rep_offsets: [Option<usize>; 3] = if $lit_len == 0 {
            [
                Some($reps[1] as usize),
                Some($reps[2] as usize),
                ($reps[0] > 1).then_some(($reps[0] - 1) as usize),
            ]
        } else {
            [
                Some($reps[0] as usize),
                Some($reps[1] as usize),
                Some($reps[2] as usize),
            ]
        };
        let concat = $table.live_history();
        let current_idx = $abs_pos - $table.history_abs_start;
        // The gate below reads 4 bytes at `current_idx`; bail out when the
        // live history cannot supply them.
        if current_idx + 4 > concat.len() {
            return;
        }
        let tail_limit = $current_abs_end.saturating_sub($abs_pos);
        let base = concat.as_ptr();
        let concat_len = concat.len();
        for rep in rep_offsets.into_iter().flatten() {
            // Reject a zero offset and offsets reaching before absolute 0.
            if rep == 0 || rep > $abs_pos {
                continue;
            }
            let candidate_pos = $abs_pos - rep;
            // Candidate must still be inside the live history.
            if candidate_pos < $table.history_abs_start {
                continue;
            }
            let candidate_idx = candidate_pos - $table.history_abs_start;
            // Upstream zstd `ZSTD_readMINMATCH` gate (zstd_opt.c:657-674): a
            // 4-byte (3-byte when min_match_len == 3) equality probe
            // before the full prefix scan. Equivalent filtering — a
            // mismatch here means `match_len < min_match_len`, which
            // the post-scan check rejects anyway — but it skips the
            // prefix-kernel call for the common no-match case (rep
            // offsets rarely hit on low-redundancy input).
            //
            // SAFETY: `current_idx + 4 <= concat_len` (early return
            // above) and `candidate_idx < current_idx` (rep >= 1), so
            // both 4-byte reads stay inside `concat`.
            let gate_matches = unsafe {
                let cand = base.add(candidate_idx).cast::<u32>().read_unaligned();
                let cur = base.add(current_idx).cast::<u32>().read_unaligned();
                if $min_match_len == 3 {
                    // `to_le()` places the lowest-address byte in the low 8
                    // bits on every target (identity on little-endian, byte
                    // swap on big-endian), so the 24-bit mask always selects
                    // the first three bytes in memory.
                    (cand.to_le() & 0x00FF_FFFF) == (cur.to_le() & 0x00FF_FFFF)
                } else {
                    cand == cur
                }
            };
            if !gate_matches {
                continue;
            }
            // SAFETY: `candidate_idx ≤ current_idx < concat_len` (since
            // candidate_pos ≤ abs_pos and we early-returned on
            // `current_idx + 4 > concat_len`). `max` clamps to the shorter
            // remaining run so neither pointer overruns `concat`.
            let max = (concat_len - candidate_idx)
                .min(concat_len - current_idx)
                .min(tail_limit);
            let match_len = unsafe { $cpl(base.add(candidate_idx), base.add(current_idx), max) };
            if match_len < $min_match_len {
                continue;
            }
            // Fully qualified through `$crate` so the expansion does not
            // depend on what the call site happens to import.
            $f($crate::encoding::opt::types::MatchCandidate {
                start: $abs_pos,
                offset: rep,
                match_len,
            });
        }
    }};
}
6114pub(crate) use for_each_repcode_candidate_body;
6115
/// `bt_insert_and_collect_matches` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Same shape as
/// [`bt_insert_step_no_rebase_body`] — picks up the matching kernel through
/// `$cmf` so the per-iteration vector probe inlines under the wrapper's
/// `target_feature` umbrella. Returns nothing (matches the original method).
/// Crate-private (see `bt_insert_step_no_rebase_body!`).
///
/// What the expansion does, in order:
/// 1. Hashes the bytes at `$abs_pos`, stores the position in
///    `hash_table[hash]` and walks the binary tree rooted at the previous
///    bucket value, re-linking `chain_table` pair slots as it descends and
///    pushing every strictly-longer match into `$out`.
/// 2. If a dictionary match state is present (`$table.dms.table()`), walks
///    the dictionary tree read-only and pushes any longer dict match.
/// 3. Writes `$table.skip_insert_until_abs`.
///
/// Arguments:
/// - `$table`: match table, mutated as described above.
/// - `$search_depth`: compare budget; the effective budget of each walk is
///   `min($profile.max_chain_depth, $search_depth)`.
/// - `$abs_pos`: absolute position being inserted / searched.
/// - `$current_abs_end`: absolute end of the current block; caps match length.
/// - `$profile`: only its `max_chain_depth` field is read here.
/// - `$min_match_len`: shortest reportable match; must be at least
///   `HC_FORMAT_MINMATCH` (debug-asserted).
/// - `$best_len_for_skip`: dereferenced to seed the "length to beat" and
///   handed to `BtMatcher::push_candidate_ladder`.
/// - `$out`: candidate sink handed to `BtMatcher::push_candidate_ladder`.
/// - `$cmf`: match-length kernel, called inside `unsafe` as
///   `$cmf(concat, idx, candidate_idx, tail_limit, seed_len)`.
///
/// The expansion contains bare `return;` statements, so it must be expanded
/// inside a function returning `()`.
macro_rules! bt_insert_and_collect_matches_body {
    (
        $table:expr,
        $search_depth:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $min_match_len:ident,
        $best_len_for_skip:ident,
        $out:ident,
        $cmf:path $(,)?
    ) => {{
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        // Fewer than 8 readable bytes at `idx`: nothing is inserted or
        // searched for this position.
        if idx + 8 > concat.len() {
            return;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT collect called past current block end"
        );
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults; `hash` indexes
            // `hash_table` directly below, so it is in bounds.
            unsafe {
                _mm_prefetch($table.hash_table.as_ptr().add(hash).cast(), _MM_HINT_T0);
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: the prefetch itself never faults. NOTE(review):
                // `ptr::add` still requires the offset to stay inside the
                // `hash_table` allocation, so this relies on
                // `hash_position_at(.., hash_log, ..)` returning an index
                // below `hash_table.len()` (same producer as `hash` above) —
                // confirm.
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return;
        };
        // Stored form is `relative + 1`; the walk below treats a stored value
        // of `HC_EMPTY` as "no entry".
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // Hoist the BT pointer-pair table's base out of `self` once: every
        // access below is `chain_table[computed_index]` through `&mut self`,
        // which the optimizer cannot prove loop-invariant, so it reloads the
        // Vec's (ptr,len) from the struct AND bounds-checks on every tree
        // step (the upstream zstd walks a raw `U32* btable`, zstd_opt.c). The raw
        // base carries no borrow, so the `&self` helper calls in the loop
        // (`bt_pair_index_for_abs`, `window_low_abs_for_target`,
        // `relative_position`) coexist — they read other fields, never
        // `chain_table`. Indices are in bounds by the BT invariants:
        // `bt_pair_index_for_abs` returns `2*(abs & bt_mask) (+1)` ≤
        // `chain_table.len()-1`, and the slots only ever hold those values.
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        // See `bt_insert_step_no_rebase_body!`: saturating is needed for the
        // first BT walk of a fresh frame where `abs_pos < bt_mask`.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        let window_low = $table.window_low_abs_for_target($abs_pos);
        // Upstream zstd-style window bound in stored space so the BT-walk loop
        // condition rejects out-of-window / HC_EMPTY candidates WITHOUT
        // decoding them (mirrors upstream `while ... matchIndex >= matchLow`):
        // one range check on `match_stored` instead of decode-then-break,
        // dropping the wasted candidate_abs decode on every walk's terminating
        // step. candidate_abs(s) = (position_base + s - 1) - index_shift =
        // base + s (wrapping); in-window ⟺ candidate_abs - window_low <
        // abs_pos - window_low ⟺ s.wrapping_add(win_off) < win_range.
        // HC_EMPTY (s = 0) maps to base = (lowest representable abs) - 1 <
        // window_low, so it falls out of range and ends the walk.
        let win_off = $table
            .position_base
            .wrapping_sub(1)
            .wrapping_sub($table.index_shift)
            .wrapping_sub(window_low);
        let win_range = $abs_pos - window_low;
        // Raw `+ 9` is safe here — see `bt_insert_step_no_rebase_body!`
        // for the full discussion of the upstream `STREAM_ABS_HEADROOM`
        // cap in `MatchTable::add_data`.
        let mut match_end_abs = $abs_pos + 9;
        let mut compares_left = $profile.max_chain_depth.min($search_depth);
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // Swap the bucket head: remember the previous occupant as the tree
        // root to walk, and publish the current position in its place.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;
        // Upstream zstd semantics: `bestLength` starts at `lengthToBeat - 1`; rep/hash3
        // probing may raise it; BT then only reports strictly longer matches.
        // `min_match_len >= HC_FORMAT_MINMATCH (3)` by configure invariant,
        // so `min_match_len - 1 >= 2` cannot underflow.
        debug_assert!(
            $min_match_len >= $crate::encoding::cost_model::HC_FORMAT_MINMATCH,
            "min_match_len must be at least HC_FORMAT_MINMATCH"
        );
        let mut best_len = (*$best_len_for_skip).max($min_match_len - 1);

        // Upstream zstd-form loop condition: the stored-space window range check
        // (`s.wrapping_add(win_off) < win_range`) rejects out-of-window and
        // HC_EMPTY candidates here, so the terminating step never enters the
        // body — no wasted candidate_abs decode, matching upstream's
        // `while ... matchIndex >= matchLow`.
        while compares_left > 0 && (match_stored as usize).wrapping_add(win_off) < win_range {
            compares_left -= 1;
            // The condition proved this candidate is in `[window_low,
            // abs_pos)`, so `match_stored >= 1` (HC_EMPTY is out of range) and
            // the `- 1` cannot underflow; candidate_abs == base + match_stored.
            let candidate_abs = ($table.position_base + (match_stored as usize - 1))
                .wrapping_sub($table.index_shift);

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            // Bytes already known to be common with both subtree bounds; the
            // kernel resumes comparing from here.
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            if match_len > best_len {
                let offset = $abs_pos - candidate_abs;
                let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    $best_len_for_skip,
                    $crate::encoding::opt::types::MatchCandidate {
                        start: $abs_pos,
                        offset,
                        match_len,
                    },
                    $min_match_len,
                );
                if accepted {
                    best_len = match_len;
                    // BT walker invariants: `candidate_abs < abs_pos`
                    // and `match_len <= tail_limit = current_abs_end -
                    // abs_pos`. So `candidate_abs + match_len <
                    // abs_pos + tail_limit = current_abs_end`, which
                    // fits in `usize` on every supported target (32-bit
                    // i686 included) — the addition stays within the
                    // current block.
                    let candidate_end = candidate_abs + match_len;
                    if candidate_end > match_end_abs {
                        match_end_abs = candidate_end;
                    }
                    // Stop early on a match that reaches the block end or is
                    // longer than the optimal parser's `HC_OPT_NUM` horizon.
                    if match_len >= tail_limit
                        || match_len > $crate::encoding::cost_model::HC_OPT_NUM
                    {
                        break;
                    }
                }
            }

            // A match running to the block end leaves no differing byte to
            // order the pair by, so the descent cannot continue.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // Candidate sorts below the current position: link it on the
                // "smaller" side and continue into its larger child.
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // Candidate sorts at or above the current position: link it on
                // the "larger" side and continue into its smaller child.
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // Terminate whichever dangling links are still open with HC_EMPTY.
        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        // Dict dual-probe (upstream zstd `ZSTD_dictMatchState`, zstd_opt.c:777-813):
        // after the live tree, descend the immutable dictionary BINARY TREE
        // (built in `prime_dms_bt`) with its OWN compare budget and push any
        // dict match longer than the live best into the ladder. The DUBT
        // descent reaches the longest dict match efficiently (a hash-chain
        // surfaced only the few same-bucket candidates and left most of the
        // dict savings unrealised at btlazy2 / btopt). Dict positions are
        // dictionary-relative concat indices in `[0, region)`, pinned at the
        // front of history, so a dict candidate at `dict_idx` sits at offset
        // `idx - dict_idx` (no upstream zstd `dmsIndexDelta`). The optimal parser
        // prices these (its DP lookahead values the repcode chain a dict match
        // seeds); the greedy/lazy parser commits the longest.
        if let Some(dms) = $table.dms.table() {
            let region = $table.dms.region_len();
            let dh = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                concat,
                idx,
                dms.hash_log,
                dms.mls,
            );
            let mut dcur = dms.hash_table[dh];
            // DUBT seed lengths: bytes already known common on each side, so
            // `$cmf` resumes from there (upstream zstd commonLengthSmaller/Larger).
            let mut common_smaller = 0usize;
            let mut common_larger = 0usize;
            let mut dms_compares = $profile.max_chain_depth.min($search_depth);
            while dms_compares > 0 && dcur != $crate::encoding::match_table::storage::HC_EMPTY {
                // `dcur != HC_EMPTY` was just checked by the loop condition;
                // stored values are `index + 1`.
                let dict_idx = (dcur - 1) as usize;
                // The dict tree holds only dict positions (`< region <= idx`).
                if dict_idx >= region || dict_idx >= idx {
                    break;
                }
                dms_compares -= 1;
                let pair = 2 * dict_idx;
                let seed = common_smaller.min(common_larger);
                // SAFETY: `dict_idx < idx` (guard above); same umbrella as the
                // live walk's `$cmf`. `seed <= prior match_len <= tail_limit`.
                // NOTE(review): this also relies on `idx + tail_limit <=
                // concat.len()`, i.e. `$current_abs_end` not exceeding the
                // live history. The entry guard in this macro only proves
                // `idx + 8 <= concat.len()` — confirm the caller invariant.
                let match_len = unsafe { $cmf(concat, idx, dict_idx, tail_limit, seed) };
                if match_len > best_len {
                    let offset = idx - dict_idx;
                    let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        $best_len_for_skip,
                        $crate::encoding::opt::types::MatchCandidate {
                            start: $abs_pos,
                            offset,
                            match_len,
                        },
                        $min_match_len,
                    );
                    if accepted {
                        best_len = match_len;
                        // NOTE(review): the live walk above measures the match
                        // end from the candidate (`candidate_abs + match_len`);
                        // here it is measured from `$abs_pos`, which pushes
                        // `skip_insert_until_abs` further. Confirm the
                        // asymmetry is intended.
                        let candidate_end = $abs_pos + match_len;
                        if candidate_end > match_end_abs {
                            match_end_abs = candidate_end;
                        }
                        if match_len > $crate::encoding::cost_model::HC_OPT_NUM {
                            break;
                        }
                    }
                }
                // Match reached the block tail: can't order the pair (upstream zstd
                // `ip+matchLength == iLimit`), and indexing `concat[idx +
                // match_len]` below would step past the searchable region.
                if match_len >= tail_limit {
                    break;
                }
                // Descend the DUBT (upstream zstd zstd_opt.c:806-811): dict candidate
                // smaller than input → its larger child is closer to `idx`.
                if concat[dict_idx + match_len] < concat[idx + match_len] {
                    common_smaller = match_len;
                    dcur = dms.chain_table[pair + 1];
                } else {
                    common_larger = match_len;
                    dcur = dms.chain_table[pair];
                }
            }
        }

        // `match_end_abs >= abs_pos + 9 >= 9` (initialized and monotonic),
        // so `match_end_abs - 8 >= 1` cannot underflow.
        $table.skip_insert_until_abs = match_end_abs - 8;
    }};
}
6453pub(crate) use bt_insert_and_collect_matches_body;
6454
6455impl HcMatchGenerator {
6456    /// Heap bytes this generator owns: the shared match table plus the BT
6457    /// backend's optimal-parser / LDM scratch (the HC knobs are inline).
6458    fn heap_size(&self) -> usize {
6459        self.table.heap_size() + self.backend.heap_size()
6460    }
6461
6462    fn should_run_btultra2_seed_pass<S: super::strategy::Strategy>(
6463        &self,
6464        current_len: usize,
6465    ) -> bool {
6466        // The in-block two-pass dynamic-stats seed (`initStats_ultra`)
6467        // is btultra2-only. `TWO_PASS_SEED` is `false` for every other
6468        // strategy — including btultra, which now shares the hash3
6469        // short-match probe but stays single-pass — so the seed call and
6470        // its body drop at codegen time for all non-btultra2 kernels.
6471        if !S::TWO_PASS_SEED {
6472            return false;
6473        }
6474        let HcBackend::Bt(bt) = &self.backend else {
6475            return false;
6476        };
6477        bt.opt_state.lit_length_sum == 0
6478            && bt.opt_state.dictionary_seed.is_none()
6479            && !self.table.dictionary_primed_for_frame
6480            && bt.ldm_sequences.is_empty()
6481            && self.table.window_size == current_len
6482            && self.table.history_abs_start == 0
6483            && self.table.chunk_lens.len() == 1
6484            && current_len > HC_PREDEF_THRESHOLD
6485    }
6486
6487    fn new(max_window_size: usize) -> Self {
6488        Self {
6489            table: super::match_table::storage::MatchTable::new(max_window_size),
6490            hc: super::hc::HcMatcher::new(2, HC_SEARCH_DEPTH, HC_TARGET_LEN),
6491            // Default to the zero-sized HC backend; `configure()` swaps
6492            // in a `BtMatcher` only when an optimal strategy lands.
6493            backend: HcBackend::Hc,
6494            // Lazy is the per-construct default — every production
6495            // caller calls `configure()` before the first encode and
6496            // overwrites this. Tests that drive `HcMatchGenerator`
6497            // without calling `configure()` end up in the
6498            // `start_matching_lazy` arm of the test dispatcher, which
6499            // matches the previous default behaviour.
6500            strategy_tag: super::strategy::StrategyTag::Lazy,
6501        }
6502    }
6503
6504    fn configure(&mut self, config: HcConfig, tag: super::strategy::StrategyTag, window_log: u8) {
6505        use super::strategy::StrategyTag;
6506        // Mirror the driver-resolved strategy tag so the
6507        // `#[cfg(test)] start_matching` dispatcher can route
6508        // BtOpt / BtUltra / BtUltra2 to distinct monomorphisations.
6509        self.strategy_tag = tag;
6510        let is_btultra2 = tag == StrategyTag::BtUltra2;
6511        let uses_bt = matches!(
6512            tag,
6513            StrategyTag::Btlazy2
6514                | StrategyTag::BtOpt
6515                | StrategyTag::BtUltra
6516                | StrategyTag::BtUltra2
6517        );
6518        // btultra and btultra2 both run the mls=3 hash3 short-match probe
6519        // (clevels.h minMatch 3). The `is_btultra2` flag below stays
6520        // exclusive to btultra2 because it tweaks the BT rebase boundary,
6521        // not match finding.
6522        let wants_hash3 = matches!(tag, StrategyTag::BtUltra | StrategyTag::BtUltra2);
6523        let next_hash3_log = if wants_hash3 {
6524            HC3_HASH_LOG.min(window_log as usize)
6525        } else {
6526            0
6527        };
6528        let resize = self.table.hash_log != config.hash_log
6529            || self.table.chain_log != config.chain_log
6530            || self.table.hash3_log != next_hash3_log;
6531        // Capture the layout flip BEFORE `uses_bt` is overwritten below — it
6532        // feeds the dms invalidation (the dms is keyed by layout too).
6533        let uses_bt_changed = self.table.uses_bt != uses_bt;
6534        self.table.hash_log = config.hash_log;
6535        self.table.chain_log = config.chain_log;
6536        self.table.hash3_log = next_hash3_log;
6537        self.hc.search_depth = if uses_bt {
6538            config.search_depth
6539        } else {
6540            config.search_depth.min(MAX_HC_SEARCH_DEPTH)
6541        };
6542        self.hc.target_len = config.target_len;
6543        // Mirror strategy-derived flags + HC search depth onto MatchTable
6544        // so the BT walker and rebase machinery can read them directly
6545        // without dispatching back through HcMatchGenerator.
6546        self.table.search_depth = self.hc.search_depth;
6547        self.table.is_btultra2 = is_btultra2;
6548        self.table.uses_bt = uses_bt;
6549        // BT finder hash width, upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`,
6550        // carried explicitly in the level config so a `target_length` override
6551        // cannot silently flip the finder between 5- and 4-byte hashing. Only
6552        // the BT body reads it; HC/lazy levels leave it at 4. clevels.h
6553        // (srcSize > 256 KiB tier): btlazy2 L13-15 + btopt L16 are minMatch=5,
6554        // btopt L17 is minMatch=4, btultra/btultra2 are minMatch=3 (4-byte main
6555        // hash + the hash3 short-match probe).
6556        // The cached dms is keyed by the full (region, layout, mls, hash_log)
6557        // shape that `build_dms!` validates on the normal prime path, but the
6558        // reborrow fast path in `MatchTable::reset` reuses it on `dms.is_primed()`
6559        // ALONE. A reused-compressor level switch can change the search mls (e.g.
6560        // btlazy2 -> lazy), the table geometry (hash_log / chain_log / hash3,
6561        // captured in `resize`), OR the HC<->BT layout (`uses_bt_changed`)
6562        // independently of each other, and any of them leaves the dms hashed for
6563        // a different shape. Invalidate on ANY so the next dict frame re-primes at
6564        // the new shape (configure runs before reset) instead of probing a
6565        // mismatched dms and silently degrading match quality. Over-invalidation
6566        // only costs a re-prime, which a real shape change needs anyway.
6567        let mls_changed = self.table.search_mls != config.search_mls;
6568        if resize || mls_changed || uses_bt_changed {
6569            self.table.dms.invalidate();
6570        }
6571        self.table.search_mls = config.search_mls;
6572        // Stage D: promote the backend discriminator. HC modes drop the
6573        // BT scratch buffers entirely; switching back into a BT mode
6574        // allocates a fresh `BtMatcher` on demand.
6575        match (&self.backend, self.table.uses_bt) {
6576            (HcBackend::Hc, true) => {
6577                self.backend = HcBackend::Bt(alloc::boxed::Box::new(super::bt::BtMatcher::new()));
6578            }
6579            (HcBackend::Bt(_), false) => {
6580                self.backend = HcBackend::Hc;
6581            }
6582            _ => {}
6583        }
6584        if resize && !self.table.hash_table.is_empty() {
6585            // Force reallocation on next ensure_tables() call.
6586            self.table.hash_table.clear();
6587            self.table.hash3_table.clear();
6588            self.table.chain_table.clear();
6589        }
6590    }
6591
6592    fn seed_dictionary_entropy(
6593        &mut self,
6594        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
6595        ll: Option<&crate::fse::fse_encoder::FSETable>,
6596        ml: Option<&crate::fse::fse_encoder::FSETable>,
6597        of: Option<&crate::fse::fse_encoder::FSETable>,
6598    ) {
6599        if let HcBackend::Bt(bt) = &mut self.backend {
6600            bt.opt_state.seed_dictionary_entropy(huff, ll, ml, of);
6601        }
6602    }
6603
6604    /// Install (or clear) the long-distance-match producer (#27). Only
6605    /// the BT backend owns an `ldm_producer` slot; on the HC (lazy)
6606    /// backend the producer is dropped because there is no optimal-parser
6607    /// candidate buffer to seed. Call after [`Self::reset`].
6608    #[cfg(feature = "hash")]
6609    fn set_ldm_producer(&mut self, producer: Option<super::ldm::LdmProducer>) {
6610        if let HcBackend::Bt(bt) = &mut self.backend {
6611            bt.ldm_producer = producer;
6612        }
6613    }
6614
6615    /// Move the LDM producer out of the BT backend, leaving `None`. Used by the
6616    /// dictionary snapshot path: the producer carries no dictionary state (LDM
6617    /// is not dict-primed; its hash table is empty at capture), so it is not
6618    /// retained in the snapshot — the working frame's freshly-reset producer is
6619    /// reinstated on restore instead.
6620    #[cfg(feature = "hash")]
6621    fn take_ldm_producer(&mut self) -> Option<super::ldm::LdmProducer> {
6622        if let HcBackend::Bt(bt) = &mut self.backend {
6623            bt.ldm_producer.take()
6624        } else {
6625            None
6626        }
6627    }
6628
6629    fn reset(&mut self, reuse_space: impl FnMut(Vec<u8>)) {
6630        self.table.reset(reuse_space);
6631        if let HcBackend::Bt(bt) = &mut self.backend {
6632            bt.reset();
6633        }
6634    }
6635
6636    /// Backfill positions from the tail of the previous slice that couldn't be
6637    /// hashed at the time (insert_position needs 4 bytes of lookahead).
6638    fn skip_matching(&mut self, incompressible_hint: Option<bool>) {
6639        self.table.skip_matching(incompressible_hint);
6640    }
6641
6642    /// Runtime-dispatched entry kept only for in-crate tests. Production
6643    /// callers reach the inner loops through
6644    /// [`Self::start_matching_strategy`] / [`MatchGeneratorDriver::compress_block`]
6645    /// which pick the lazy / optimal arm from `S::USE_BT` at
6646    /// monomorphisation time.
6647    #[cfg(test)]
6648    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6649        use super::strategy::{self, StrategyTag};
6650        // Dispatch on the mirrored `strategy_tag` so each test runs
6651        // under the same monomorphisation production would pick.
6652        // `BtOpt` / `BtUltra` / `BtUltra2` remain distinct here even
6653        // though `table.uses_bt` / `is_btultra2` alone can't separate
6654        // BtOpt from BtUltra.
6655        match self.strategy_tag {
6656            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
6657                self.start_matching_lazy(&mut handle_sequence)
6658            }
6659            StrategyTag::Btlazy2 => self.start_matching_btlazy2(&mut handle_sequence),
6660            StrategyTag::BtOpt => {
6661                self.start_matching_optimal::<strategy::BtOpt>(&mut handle_sequence)
6662            }
6663            StrategyTag::BtUltra => {
6664                self.start_matching_optimal::<strategy::BtUltra>(&mut handle_sequence)
6665            }
6666            StrategyTag::BtUltra2 => {
6667                self.start_matching_optimal::<strategy::BtUltra2>(&mut handle_sequence)
6668            }
6669        }
6670    }
6671
6672    /// Strategy-aware entry point used by
6673    /// [`MatchGeneratorDriver::compress_block`]. Branches on
6674    /// `S::USE_BT` — a compile-time `const` — so each
6675    /// monomorphisation keeps exactly one arm: `Lazy` /
6676    /// `Fast` / `Dfast` / `Greedy` see only `start_matching_lazy`,
6677    /// `BtOpt` / `BtUltra` / `BtUltra2` see only
6678    /// `start_matching_optimal`. The inherent test-only
6679    /// [`HcMatchGenerator::start_matching`] reaches the same arms by
6680    /// runtime-matching on `self.strategy_tag` (the parse-mode field
6681    /// has been removed); production never invokes that path.
6682    pub(crate) fn start_matching_strategy<S: super::strategy::Strategy>(
6683        &mut self,
6684        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
6685    ) {
6686        debug_assert_eq!(
6687            self.table.uses_bt,
6688            S::USE_BT,
6689            "Strategy::USE_BT disagrees with runtime table.uses_bt at HC dispatch"
6690        );
6691        if S::USE_BT {
6692            self.start_matching_optimal::<S>(handle_sequence)
6693        } else {
6694            self.start_matching_lazy(handle_sequence)
6695        }
6696    }
6697
6698    /// Dispatcher: pick the dict-aware monomorph when a separate dms is primed
6699    /// (attach-mode dictionary), else the no-dict monomorph. Mirrors upstream's
6700    /// compile-time `dictMode` split — the `DICT = false` body carries no dms
6701    /// code at all, so the no-dict hot path is unaffected by the dict search.
6702    pub(crate) fn start_matching_lazy(
6703        &mut self,
6704        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6705    ) {
6706        if self.table.dms.is_primed() {
6707            self.start_matching_lazy_impl::<true>(handle_sequence);
6708        } else {
6709            self.start_matching_lazy_impl::<false>(handle_sequence);
6710        }
6711    }
6712
6713    fn start_matching_lazy_impl<const DICT: bool>(
6714        &mut self,
6715        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6716    ) {
6717        self.table.ensure_tables();
6718
6719        // `current_block_range()` is borrowed-aware: owned → last committed
6720        // chunk; borrowed → the staged in-place block range.
6721        let (current_abs_start, current_len) = self.table.current_block_range();
6722        if current_len == 0 {
6723            return;
6724        }
6725        // The current block is the tail of `history` (owned) or the staged
6726        // borrowed range (`get_last_space()` resolves both). Hoist it as a raw
6727        // slice: the routine mutates the hash/chain tables + `offset_hist` but
6728        // never reallocates `history`, so the slice stays valid and we avoid
6729        // re-borrowing `self.table` (which would conflict with the
6730        // `offset_hist` write).
6731        let current_ptr = self.table.get_last_space().as_ptr();
6732        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
6733
6734        // Full live history (dict + committed blocks + current block), hoisted
6735        // ONCE for the whole position scan and threaded into every
6736        // `find_best_match` / `pick_lazy_match` call. `live_history()` is
6737        // loop-invariant here (the scan mutates the hash/chain tables +
6738        // `offset_hist` but never the history bytes or length), so re-fetching
6739        // it per find — inside `hash_chain_candidate` + the rep probe, plus
6740        // again for each lazy lookahead at pos+1 / pos+2 — was pure
6741        // per-position overhead. Same raw-slice detach as `current` so the
6742        // loop's `&mut self.table` inserts coexist with this `&[u8]`.
6743        let concat: &[u8] = {
6744            let lh = self.table.live_history();
6745            unsafe { core::slice::from_raw_parts(lh.as_ptr(), lh.len()) }
6746        };
6747        // Dict-match-state primed flag, hoisted ONCE for the scan: it is
6748        // block-invariant (the dict is primed before the block) and lives on the
6749        // cold `dms` cacheline, so the per-find `dms.is_primed()` load was a
6750        // measurable hot-path cost (~8% of `hash_chain_candidate` on the
6751        // dict-over-random fixture). The `DICT = false` monomorph ignores it.
6752        let dms_primed = self.table.dms.is_primed();
6753
6754        let current_abs_end = current_abs_start + current_len;
6755        self.table
6756            .backfill_boundary_positions(current_abs_start, current_abs_end);
6757
6758        let mut pos = 0usize;
6759        let mut literals_start = 0usize;
6760        while pos + HC_MIN_MATCH_LEN <= current_len {
6761            let abs_pos = current_abs_start + pos;
6762            let lit_len = pos - literals_start;
6763
6764            // `find_best_match` returns the forward `(offset, length)` in
6765            // registers (`HcMatch`, 16 bytes) — no 24-byte `MatchCandidate` /
6766            // 32-byte `Option` spilled-and-copied per position. The backward
6767            // extension that yields `start` runs ONCE here, after the lazy
6768            // decision settles, exactly like upstream's lazy loop.
6769            let best =
6770                self.hc
6771                    .find_best_match::<DICT>(concat, dms_primed, &self.table, abs_pos, lit_len);
6772            if best.is_match() {
6773                if self.hc.pick_lazy_match::<DICT>(
6774                    concat,
6775                    dms_primed,
6776                    &self.table,
6777                    abs_pos,
6778                    lit_len,
6779                    best,
6780                ) {
6781                    // Backward-extend over the literal run (upstream `zstd_lazy.c`
6782                    // after rep-vs-chain selection). The offset is preserved;
6783                    // `start` and `match_len` grow by the same amount, bounded by
6784                    // `literals_start` (the `min_abs` floor) so it never crosses
6785                    // an already-emitted sequence.
6786                    let history_abs_start = self.table.history_abs_start;
6787                    let min_abs = abs_pos - lit_len;
6788                    let mut start_abs = abs_pos;
6789                    let mut cand_abs = abs_pos - best.offset;
6790                    let mut match_len = best.match_len;
6791                    while start_abs > min_abs
6792                        && cand_abs > history_abs_start
6793                        && concat[cand_abs - history_abs_start - 1]
6794                            == concat[start_abs - history_abs_start - 1]
6795                    {
6796                        start_abs -= 1;
6797                        cand_abs -= 1;
6798                        match_len += 1;
6799                    }
6800                    self.table.insert_match_span(abs_pos, start_abs + match_len);
6801                    let start = start_abs - current_abs_start;
6802                    let literals = &current[literals_start..start];
6803                    handle_sequence(Sequence::Triple {
6804                        literals,
6805                        offset: best.offset,
6806                        match_len,
6807                    });
6808                    let _ = encode_offset_with_history(
6809                        best.offset as u32,
6810                        literals.len() as u32,
6811                        &mut self.table.offset_hist,
6812                    );
6813                    pos = start + match_len;
6814                    literals_start = pos;
6815                    continue;
6816                }
6817                // Lazy lookahead found a better match at `abs_pos + 1` / `+ 2`
6818                // (defer): advance exactly ONE byte (upstream
6819                // `ZSTD_compressBlock_lazy_generic`) so the deferred candidate is
6820                // re-evaluated at its own position; the no-match skip below could
6821                // jump past it once the literal run reaches 256+ bytes.
6822                self.table.insert_position(abs_pos);
6823                pos += 1;
6824                continue;
6825            }
6826            // No match found.
6827            self.table.insert_position(abs_pos);
6828            // Lazy skipping (upstream zstd `ZSTD_compressBlock_lazy_generic`,
6829            // zstd_lazy.c:1614): advance faster over runs with no match.
6830            // `step = ((ip - anchor) >> kSearchStrength) + 1` with
6831            // kSearchStrength = 8, where `ip - anchor` is the current
6832            // literal-run length. On compressible input the run stays short
6833            // (step == 1, identical to a 1-byte advance); on incompressible
6834            // / dict-over-random input the run grows so the parser skips
6835            // ahead (one search per `step` positions) instead of searching
6836            // every byte. Skipped positions are not inserted, mirroring
6837            // upstream (it inserts only searched positions during a no-match
6838            // run). Ratio follows upstream (not byte-identical).
6839            let step = ((pos - literals_start) >> 8) + 1;
6840            pos += step;
6841            // No clamp needed before the tail loop: the search bound and the
6842            // hashable bound are both `pos + HC_MIN_MATCH_LEN <= current_len`
6843            // (HC_MIN_MATCH_LEN == 4 == the insert width), so there is no
6844            // non-searchable-but-hashable anchor to miss. Positions the skip
6845            // jumps over inside the searchable region are intentionally not
6846            // inserted — same as upstream zstd, which advances past them via
6847            // the identical `ip += step` and never hashes them either.
6848        }
6849
6850        // Insert remaining hashable positions in the tail (the matching loop
6851        // stops at HC_MIN_MATCH_LEN but insert_position only needs 4 bytes).
6852        while pos + 4 <= current_len {
6853            self.table.insert_position(current_abs_start + pos);
6854            pos += 1;
6855        }
6856
6857        if literals_start < current_len {
6858            handle_sequence(Sequence::Literals {
6859                literals: &current[literals_start..],
6860            });
6861        }
6862    }
6863
6864    /// Register the borrowed input window for the no-copy one-shot path.
6865    /// # Safety
6866    /// `buffer` must outlive the borrowed scans (see `MatchTable`).
6867    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
6868        // SAFETY: forwarded liveness contract.
6869        unsafe { self.table.set_borrowed_window(buffer) };
6870    }
6871
6872    pub(crate) fn clear_borrowed_window(&mut self) {
6873        self.table.clear_borrowed_window();
6874    }
6875
6876    /// Borrowed (no-copy) equivalent of [`Self::start_matching_lazy`]: stage
6877    /// the in-place block range, then run the same lazy chain parse. The
6878    /// parse reads its range via `current_block_range()` and its bytes via
6879    /// `get_last_space()` / `live_history()`, all borrowed-aware, so the block
6880    /// is scanned in place with the per-position window_low offset cap.
6881    pub(crate) fn start_matching_lazy_borrowed(
6882        &mut self,
6883        block_start: usize,
6884        block_end: usize,
6885        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6886    ) {
6887        self.table.stage_borrowed_block(block_start, block_end);
6888        self.start_matching_lazy(handle_sequence);
6889    }
6890
6891    /// Borrowed (no-copy) equivalent of the lazy `skip_matching`: stage the
6892    /// in-place block, then seed positions without an owned-history append.
6893    pub(crate) fn skip_matching_borrowed(
6894        &mut self,
6895        block_start: usize,
6896        block_end: usize,
6897        incompressible_hint: Option<bool>,
6898    ) {
6899        self.table.stage_borrowed_block(block_start, block_end);
6900        self.table.skip_matching(incompressible_hint);
6901    }
6902
6903    /// Upstream zstd `ZSTD_btlazy2` (levels 13-15): binary-tree match finder with a
6904    /// greedy/lazy parse. Bare dispatcher — resolves the runtime tier ONCE
6905    /// per block via `select_kernel()` and calls the matching
6906    /// `start_matching_btlazy2_<kernel>` wrapper, so the per-position BT
6907    /// collect runs under a single `#[target_feature]` umbrella (mirrors
6908    /// `build_optimal_plan_impl`). See `start_matching_btlazy2_body!` for the
6909    /// shared loop.
6910    fn start_matching_btlazy2(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6911        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6912        unsafe {
6913            self.start_matching_btlazy2_neon(&mut handle_sequence)
6914        }
6915        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6916        {
6917            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
6918            match select_kernel() {
6919                FastpathKernel::Avx2Bmi2 => unsafe {
6920                    self.start_matching_btlazy2_avx2_bmi2(&mut handle_sequence)
6921                },
6922                FastpathKernel::Sse42 => unsafe {
6923                    self.start_matching_btlazy2_sse42(&mut handle_sequence)
6924                },
6925                FastpathKernel::Scalar => self.start_matching_btlazy2_scalar(&mut handle_sequence),
6926            }
6927        }
6928        #[cfg(not(any(
6929            all(target_arch = "aarch64", target_endian = "little"),
6930            target_arch = "x86",
6931            target_arch = "x86_64"
6932        )))]
6933        {
6934            self.start_matching_btlazy2_scalar(&mut handle_sequence)
6935        }
6936    }
6937
6938    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6939    #[target_feature(enable = "neon")]
6940    unsafe fn start_matching_btlazy2_neon(
6941        &mut self,
6942        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6943    ) {
6944        start_matching_btlazy2_body!(
6945            self,
6946            handle_sequence,
6947            collect_optimal_candidates_initialized_neon,
6948            crate::encoding::fastpath::neon::count_match_from_indices
6949        )
6950    }
6951
6952    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6953    #[target_feature(enable = "sse4.2")]
6954    unsafe fn start_matching_btlazy2_sse42(
6955        &mut self,
6956        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6957    ) {
6958        start_matching_btlazy2_body!(
6959            self,
6960            handle_sequence,
6961            collect_optimal_candidates_initialized_sse42,
6962            crate::encoding::fastpath::sse42::count_match_from_indices
6963        )
6964    }
6965
6966    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6967    #[target_feature(enable = "avx2,bmi2")]
6968    unsafe fn start_matching_btlazy2_avx2_bmi2(
6969        &mut self,
6970        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6971    ) {
6972        start_matching_btlazy2_body!(
6973            self,
6974            handle_sequence,
6975            collect_optimal_candidates_initialized_avx2_bmi2,
6976            crate::encoding::fastpath::avx2_bmi2::count_match_from_indices
6977        )
6978    }
6979
6980    // Scalar wrapper: no `#[target_feature]`; `$collect` (the scalar collect)
6981    // is a safe fn, so the body macro's `unsafe` block is inert here. Same cfg
6982    // as `collect_optimal_candidates_initialized_scalar` (absent on
6983    // aarch64-little, where NEON is the baseline tier).
6984    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
6985    #[allow(unused_unsafe)]
6986    fn start_matching_btlazy2_scalar(
6987        &mut self,
6988        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6989    ) {
6990        start_matching_btlazy2_body!(
6991            self,
6992            handle_sequence,
6993            collect_optimal_candidates_initialized_scalar,
6994            crate::encoding::fastpath::scalar::count_match_from_indices
6995        )
6996    }
6997
6998    fn start_matching_optimal<S: super::strategy::Strategy>(
6999        &mut self,
7000        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
7001    ) {
7002        self.table.ensure_tables();
7003        // Borrowed-aware: owned → last committed chunk; borrowed → staged
7004        // in-place block range.
7005        let (current_abs_start, current_len) = self.table.current_block_range();
7006        if current_len == 0 {
7007            return;
7008        }
7009        let current_ptr = self.table.get_last_space().as_ptr();
7010        // `start_matching_optimal()` mutates tables/state but never mutates or
7011        // reallocates `self.table.history`, so this tail slice remains valid for
7012        // the duration of the routine and avoids cloning the full block.
7013        let current = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
7014
7015        let current_abs_end = current_abs_start + current_len;
7016        self.table
7017            .apply_limited_update_after_long_match(current_abs_start);
7018        let hash3_start_cursor = self
7019            .table
7020            .skip_insert_until_abs
7021            .max(self.table.history_abs_start);
7022        self.table
7023            .backfill_boundary_positions(current_abs_start, current_abs_end);
7024        self.table.next_to_update3 = hash3_start_cursor;
7025        // Borrow split: `prepare_ldm_candidates` needs immutable
7026        // access to the live history (the post-`history_start`
7027        // slice of `self.table.history`) while it mutates the LDM
7028        // bucket table owned by `self.backend.bt_mut()`. Both live
7029        // in disjoint fields of `Self`, so we capture the slice +
7030        // its base before reaching for `bt_mut()`.
7031        //
7032        // The producer operates in absolute stream coordinates
7033        // throughout; `live_history[0]` corresponds to absolute
7034        // `history_abs_start` (upstream zstd `base + dictLimit`), and the
7035        // abs→slice translation happens inside the producer at
7036        // each `live_history[..]` access. Passing the full
7037        // `history` Vec would index into the dead prefix (the
7038        // bytes already retired past `history_start`).
7039        let live_history = self.table.live_history();
7040        let history_abs_start = self.table.history_abs_start;
7041        self.backend.bt_mut().prepare_ldm_candidates(
7042            live_history,
7043            history_abs_start,
7044            current_abs_start,
7045            current_len,
7046        );
7047
7048        if self.should_run_btultra2_seed_pass::<S>(current_len) {
7049            self.run_btultra2_seed_pass(current, current_abs_start, current_len);
7050        }
7051
7052        // Const-generic profile selection: every field is folded from
7053        // S's associated consts (MAX_CHAIN_DEPTH /
7054        // SUFFICIENT_MATCH_LEN / ACCURATE_PRICE / FAVOR_SMALL_OFFSETS),
7055        // so the optimiser produces the literal at codegen time
7056        // without a runtime match.
7057        let profile = HcOptimalCostProfile::const_for_strategy::<S>();
7058        let mut opt_state =
7059            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
7060        opt_state.rescale_freqs(current, profile);
7061        let mut best_plan = core::mem::take(&mut self.backend.bt_mut().opt_segment_plan_scratch);
7062        best_plan.clear();
7063        let mut plan_reps = self.table.offset_hist;
7064        let (mut cursor, mut plan_litlen) =
7065            self.table.opt_start_cursor_and_litlen(current_abs_start);
7066        let mut plan_literals_cursor = 0usize;
7067        let match_loop_limit = current_len.saturating_sub(8);
7068        while cursor < match_loop_limit {
7069            let remaining_len = current_len - cursor;
7070            let segment_abs_start = current_abs_start + cursor;
7071            let segment_start = best_plan.len();
7072            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
7073                &current[cursor..],
7074                segment_abs_start,
7075                remaining_len,
7076                HcOptimalPlanState {
7077                    block_offset: cursor,
7078                    reps: plan_reps,
7079                    litlen: plan_litlen,
7080                    profile,
7081                },
7082                &opt_state,
7083                &mut best_plan,
7084            );
7085            BtMatcher::update_plan_stats_segment(
7086                current,
7087                current_len,
7088                &best_plan[segment_start..],
7089                &mut plan_literals_cursor,
7090                &mut plan_reps,
7091                &mut opt_state,
7092                profile.accurate,
7093            );
7094            plan_reps = end_reps;
7095            plan_litlen = end_litlen;
7096            cursor += consumed_len;
7097        }
7098
7099        self.table
7100            .emit_optimal_plan(current_len, &best_plan, &mut handle_sequence);
7101        best_plan.clear();
7102        self.backend.bt_mut().opt_segment_plan_scratch = best_plan;
7103        self.backend.bt_mut().opt_state = opt_state;
7104    }
7105
7106    fn run_btultra2_seed_pass(
7107        &mut self,
7108        current: &[u8],
7109        current_abs_start: usize,
7110        current_len: usize,
7111    ) {
7112        // The seed pass is BtUltra2-exclusive by name (the only
7113        // caller is `should_run_btultra2_seed_pass`), so pin `S` to
7114        // `BtUltra2` for both the cost-profile lookup and the
7115        // `build_optimal_plan::<S>` call below.
7116        type S = super::strategy::BtUltra2;
7117        let seed_profile = HcOptimalCostProfile::const_for_strategy::<S>();
7118        let mut opt_state =
7119            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
7120        opt_state.rescale_freqs(current, seed_profile);
7121        let mut seed_reps = self.table.offset_hist;
7122        let (mut cursor, mut seed_litlen) =
7123            self.table.opt_start_cursor_and_litlen(current_abs_start);
7124        let mut seed_literals_cursor = 0usize;
7125        let mut seed_plan = core::mem::take(&mut self.backend.bt_mut().opt_seed_plan_scratch);
7126        seed_plan.clear();
7127        let match_loop_limit = current_len.saturating_sub(8);
7128        while cursor < match_loop_limit {
7129            let remaining_len = current_len - cursor;
7130            let segment_abs_start = current_abs_start + cursor;
7131            let segment_start = seed_plan.len();
7132            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
7133                &current[cursor..],
7134                segment_abs_start,
7135                remaining_len,
7136                HcOptimalPlanState {
7137                    block_offset: cursor,
7138                    reps: seed_reps,
7139                    litlen: seed_litlen,
7140                    profile: seed_profile,
7141                },
7142                &opt_state,
7143                &mut seed_plan,
7144            );
7145            BtMatcher::update_plan_stats_segment(
7146                current,
7147                current_len,
7148                &seed_plan[segment_start..],
7149                &mut seed_literals_cursor,
7150                &mut seed_reps,
7151                &mut opt_state,
7152                seed_profile.accurate,
7153            );
7154            seed_plan.truncate(segment_start);
7155            seed_reps = end_reps;
7156            seed_litlen = end_litlen;
7157            cursor += consumed_len;
7158        }
7159        seed_plan.clear();
7160        self.backend.bt_mut().opt_seed_plan_scratch = seed_plan;
7161        self.backend.bt_mut().opt_state = opt_state;
7162
7163        // Upstream zstd initStats_ultra keeps the collected entropy statistics but
7164        // invalidates the first-pass matchfinder history before the real pass.
7165        self.table.position_base = self.table.history_abs_start;
7166        self.table.index_shift = current_len;
7167        self.table.next_to_update3 = current_abs_start;
7168        self.table.skip_insert_until_abs = current_abs_start;
7169        // Upstream zstd `ZSTD_initStats_ultra()` invalidates the first scan by moving
7170        // `window.base` back by `srcSize`, making the real pass start at
7171        // `curr == srcSize` instead of 0. Position 0 is therefore a valid
7172        // table entry in the second pass even though raw C tables reserve
7173        // value 0 as empty during an unshifted first pass.
7174        self.table.allow_zero_relative_position = true;
7175    }
7176
7177    fn build_optimal_plan<S: super::strategy::Strategy>(
7178        &mut self,
7179        current: &[u8],
7180        current_abs_start: usize,
7181        current_len: usize,
7182        initial_state: HcOptimalPlanState,
7183        stats: &HcOptState,
7184        out: &mut Vec<HcOptimalSequence>,
7185    ) -> (u32, [u32; 3], usize, usize) {
7186        debug_assert!(S::USE_BT, "build_optimal_plan called on non-BT strategy");
7187        debug_assert_eq!(initial_state.profile.accurate, S::ACCURATE_PRICE);
7188        debug_assert_eq!(
7189            initial_state.profile.favor_small_offsets,
7190            S::FAVOR_SMALL_OFFSETS
7191        );
7192        // `S::ACCURATE_PRICE` / `S::FAVOR_SMALL_OFFSETS` cannot appear
7193        // as const-generic arguments yet (`generic_const_exprs` is
7194        // still unstable), so dispatch over a 4-arm match — but on the
7195        // strategy's ASSOCIATED CONSTS, not the runtime profile (the
7196        // `debug_assert_eq`s above pin the runtime profile to those
7197        // consts). A const scrutinee folds the three dead arms at
7198        // monomorphisation; matching the runtime profile instead kept
7199        // all four `#[inline(always)]` DP bodies (~16 KB each) alive in
7200        // EVERY `S` instantiation — ~360 KB of the wasm payload.
7201        match (S::ACCURATE_PRICE, S::FAVOR_SMALL_OFFSETS) {
7202            (true, false) => self.build_optimal_plan_impl::<S, true, false>(
7203                current,
7204                current_abs_start,
7205                current_len,
7206                initial_state,
7207                stats,
7208                out,
7209            ),
7210            (true, true) => self.build_optimal_plan_impl::<S, true, true>(
7211                current,
7212                current_abs_start,
7213                current_len,
7214                initial_state,
7215                stats,
7216                out,
7217            ),
7218            (false, false) => self.build_optimal_plan_impl::<S, false, false>(
7219                current,
7220                current_abs_start,
7221                current_len,
7222                initial_state,
7223                stats,
7224                out,
7225            ),
7226            (false, true) => self.build_optimal_plan_impl::<S, false, true>(
7227                current,
7228                current_abs_start,
7229                current_len,
7230                initial_state,
7231                stats,
7232                out,
7233            ),
7234        }
7235    }
7236
7237    /// Cross-platform DP entry. Picks the kernel-specific variant so the
7238    /// entire optimal-parser DP body (per-position match gathering, price
7239    /// updates, traceback) runs inside a single `target_feature` umbrella
7240    /// alongside the per-position `collect_optimal_candidates_initialized_
7241    /// <kernel>`. This eliminates the final ABI barrier on the hot per-
7242    /// position match-collection call — the level22 critical path is now
7243    /// one straight-line inline chain from DP body down through BT walk
7244    /// and match-length probes.
7245    #[inline(always)]
7246    fn build_optimal_plan_impl<
7247        S: super::strategy::Strategy,
7248        const ACCURATE_PRICE: bool,
7249        const FAVOR_SMALL_OFFSETS: bool,
7250    >(
7251        &mut self,
7252        current: &[u8],
7253        current_abs_start: usize,
7254        current_len: usize,
7255        initial_state: HcOptimalPlanState,
7256        stats: &HcOptState,
7257        out: &mut Vec<HcOptimalSequence>,
7258    ) -> (u32, [u32; 3], usize, usize) {
7259        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7260        unsafe {
7261            self.build_optimal_plan_impl_neon::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7262                current,
7263                current_abs_start,
7264                current_len,
7265                initial_state,
7266                stats,
7267                out,
7268            )
7269        }
7270        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7271        {
7272            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
7273            match select_kernel() {
7274                FastpathKernel::Avx2Bmi2 => unsafe {
7275                    self.build_optimal_plan_impl_avx2_bmi2::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7276                        current,
7277                        current_abs_start,
7278                        current_len,
7279                        initial_state,
7280                        stats,
7281                        out,
7282                    )
7283                },
7284                FastpathKernel::Sse42 => unsafe {
7285                    self.build_optimal_plan_impl_sse42::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7286                        current,
7287                        current_abs_start,
7288                        current_len,
7289                        initial_state,
7290                        stats,
7291                        out,
7292                    )
7293                },
7294                FastpathKernel::Scalar => self
7295                    .build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7296                        current,
7297                        current_abs_start,
7298                        current_len,
7299                        initial_state,
7300                        stats,
7301                        out,
7302                    ),
7303            }
7304        }
7305        // wasm with simd128: route through the simd128 DP body (4-lane price-set).
7306        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
7307        unsafe {
7308            self.build_optimal_plan_impl_simd128::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7309                current,
7310                current_abs_start,
7311                current_len,
7312                initial_state,
7313                stats,
7314                out,
7315            )
7316        }
7317        #[cfg(not(any(
7318            all(target_arch = "aarch64", target_endian = "little"),
7319            target_arch = "x86",
7320            target_arch = "x86_64",
7321            all(target_arch = "wasm32", target_feature = "simd128")
7322        )))]
7323        {
7324            self.build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7325                current,
7326                current_abs_start,
7327                current_len,
7328                initial_state,
7329                stats,
7330                out,
7331            )
7332        }
7333    }
7334
    /// NEON-umbrella DP body. Inlines
    /// `collect_optimal_candidates_initialized_neon` (and its entire
    /// per-position pipeline) directly into the DP loop.
    ///
    /// The function is a thin trampoline: its whole body is the expansion of
    /// `build_optimal_plan_impl_body!`, parameterised with the NEON candidate
    /// collector and the NEON price-set routine. `ACCURATE_PRICE` and
    /// `FAVOR_SMALL_OFFSETS` are not forwarded as macro arguments; presumably
    /// the macro body picks them up by name (confirm at the macro
    /// definition). The meaning of the returned tuple's fields is likewise
    /// defined by that macro.
    ///
    /// # Safety
    ///
    /// Compiled with `#[target_feature(enable = "neon")]`, so the caller must
    /// guarantee that the executing CPU supports NEON.
    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    #[target_feature(enable = "neon")]
    unsafe fn build_optimal_plan_impl_neon<
        S: super::strategy::Strategy,
        const ACCURATE_PRICE: bool,
        const FAVOR_SMALL_OFFSETS: bool,
    >(
        &mut self,
        current: &[u8],
        current_abs_start: usize,
        current_len: usize,
        initial_state: HcOptimalPlanState,
        stats: &HcOptState,
        out: &mut Vec<HcOptimalSequence>,
    ) -> (u32, [u32; 3], usize, usize) {
        build_optimal_plan_impl_body!(
            self,
            S,
            current,
            current_abs_start,
            current_len,
            initial_state,
            stats,
            out,
            // Kernel-specific callees spliced into the shared body:
            collect_optimal_candidates_initialized_neon,
            priceset_range_nonabort_neon,
        )
    }
7366
    /// SSE4.2-umbrella DP body.
    ///
    /// Expands `build_optimal_plan_impl_body!` with the SSE4.2 candidate
    /// collector and the `priceset_range_nonabort_sse41` price-set routine
    /// (note the `_sse41` suffix on the latter). `ACCURATE_PRICE` and
    /// `FAVOR_SMALL_OFFSETS` are not passed as macro arguments; presumably the
    /// macro body reads them by name (confirm at the macro definition). The
    /// meaning of the returned tuple's fields is defined by that macro.
    ///
    /// # Safety
    ///
    /// Compiled with `#[target_feature(enable = "sse4.2")]`, so the caller
    /// must guarantee that the executing CPU supports SSE4.2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse4.2")]
    unsafe fn build_optimal_plan_impl_sse42<
        S: super::strategy::Strategy,
        const ACCURATE_PRICE: bool,
        const FAVOR_SMALL_OFFSETS: bool,
    >(
        &mut self,
        current: &[u8],
        current_abs_start: usize,
        current_len: usize,
        initial_state: HcOptimalPlanState,
        stats: &HcOptState,
        out: &mut Vec<HcOptimalSequence>,
    ) -> (u32, [u32; 3], usize, usize) {
        build_optimal_plan_impl_body!(
            self,
            S,
            current,
            current_abs_start,
            current_len,
            initial_state,
            stats,
            out,
            // Kernel-specific callees spliced into the shared body:
            collect_optimal_candidates_initialized_sse42,
            priceset_range_nonabort_sse41,
        )
    }
7395
    /// AVX2+BMI2-umbrella DP body.
    ///
    /// Expands `build_optimal_plan_impl_body!` with the AVX2/BMI2 candidate
    /// collector and the AVX2 price-set routine. `ACCURATE_PRICE` and
    /// `FAVOR_SMALL_OFFSETS` are not passed as macro arguments; presumably the
    /// macro body reads them by name (confirm at the macro definition). The
    /// meaning of the returned tuple's fields is defined by that macro.
    ///
    /// # Safety
    ///
    /// Compiled with `#[target_feature(enable = "avx2,bmi2")]`, so the caller
    /// must guarantee that the executing CPU supports both AVX2 and BMI2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2,bmi2")]
    unsafe fn build_optimal_plan_impl_avx2_bmi2<
        S: super::strategy::Strategy,
        const ACCURATE_PRICE: bool,
        const FAVOR_SMALL_OFFSETS: bool,
    >(
        &mut self,
        current: &[u8],
        current_abs_start: usize,
        current_len: usize,
        initial_state: HcOptimalPlanState,
        stats: &HcOptState,
        out: &mut Vec<HcOptimalSequence>,
    ) -> (u32, [u32; 3], usize, usize) {
        build_optimal_plan_impl_body!(
            self,
            S,
            current,
            current_abs_start,
            current_len,
            initial_state,
            stats,
            out,
            // Kernel-specific callees spliced into the shared body:
            collect_optimal_candidates_initialized_avx2_bmi2,
            priceset_range_nonabort_avx2,
        )
    }
7424
    /// Portable (no `target_feature`) DP body.
    ///
    /// Expands `build_optimal_plan_impl_body!` with the scalar candidate
    /// collector and the scalar price-set routine. Unlike the SIMD variants
    /// this is a safe `fn`. It is compiled on every target except
    /// little-endian aarch64. The meaning of the returned tuple's fields is
    /// defined by the body macro.
    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
    // Body macros wrap callees in `unsafe { }` for the NEON/AVX/SSE
    // variants where callees are `unsafe fn`. The scalar wrappers route
    // through safe fns, so those blocks are redundant on this path.
    #[allow(unused_unsafe)]
    // The dispatch reaches this only on non-SIMD x86 (Scalar tier) and the
    // portable fallback; on wasm+simd128 the simd128 wrapper is selected, so
    // this is cfg-dead there.
    #[cfg_attr(
        all(target_arch = "wasm32", target_feature = "simd128"),
        allow(dead_code)
    )]
    fn build_optimal_plan_impl_scalar<
        S: super::strategy::Strategy,
        const ACCURATE_PRICE: bool,
        const FAVOR_SMALL_OFFSETS: bool,
    >(
        &mut self,
        current: &[u8],
        current_abs_start: usize,
        current_len: usize,
        initial_state: HcOptimalPlanState,
        stats: &HcOptState,
        out: &mut Vec<HcOptimalSequence>,
    ) -> (u32, [u32; 3], usize, usize) {
        build_optimal_plan_impl_body!(
            self,
            S,
            current,
            current_abs_start,
            current_len,
            initial_state,
            stats,
            out,
            // Kernel-specific callees spliced into the shared body:
            collect_optimal_candidates_initialized_scalar,
            priceset_range_nonabort_scalar,
        )
    }
7463
    /// wasm `simd128`-umbrella DP body: scalar candidate collection (no wasm
    /// collect kernel) but the simd128 4-lane price-set.
    ///
    /// Expands `build_optimal_plan_impl_body!`; only the price-set callee
    /// differs from the scalar wrapper. The meaning of the returned tuple's
    /// fields is defined by that macro.
    ///
    /// # Safety
    ///
    /// Declared `unsafe` with `#[target_feature(enable = "simd128")]`. The
    /// function only exists when `simd128` is already a compile-time target
    /// feature (see the `cfg` below), so the feature requirement is met by
    /// construction on every build that contains it.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    #[target_feature(enable = "simd128")]
    // With `+simd128` in the wasm baseline the shared body macro's `unsafe`
    // blocks (needed by the safe scalar wrapper) are redundant inside this
    // target_feature fn.
    #[allow(unused_unsafe)]
    unsafe fn build_optimal_plan_impl_simd128<
        S: super::strategy::Strategy,
        const ACCURATE_PRICE: bool,
        const FAVOR_SMALL_OFFSETS: bool,
    >(
        &mut self,
        current: &[u8],
        current_abs_start: usize,
        current_len: usize,
        initial_state: HcOptimalPlanState,
        stats: &HcOptState,
        out: &mut Vec<HcOptimalSequence>,
    ) -> (u32, [u32; 3], usize, usize) {
        build_optimal_plan_impl_body!(
            self,
            S,
            current,
            current_abs_start,
            current_len,
            initial_state,
            stats,
            out,
            // Scalar collector + simd128 price-set:
            collect_optimal_candidates_initialized_scalar,
            priceset_range_nonabort_simd128,
        )
    }
7498
7499    #[cfg(test)]
7500    fn collect_optimal_candidates(
7501        &mut self,
7502        abs_pos: usize,
7503        current_abs_end: usize,
7504        profile: HcOptimalCostProfile,
7505        query: HcCandidateQuery,
7506        out: &mut Vec<MatchCandidate>,
7507    ) {
7508        use super::strategy::{self, StrategyTag};
7509        self.table.ensure_tables();
7510        // Dispatch purely from `self.strategy_tag` (set by
7511        // `configure()`). Tests must configure the matcher the same
7512        // way production does — wiring up `table.hash3_log` directly
7513        // without setting a matching `strategy_tag` is no longer
7514        // allowed.
7515        match self.strategy_tag {
7516            StrategyTag::BtUltra2 => self
7517                .collect_optimal_candidates_initialized::<strategy::BtUltra2, true>(
7518                    abs_pos,
7519                    current_abs_end,
7520                    profile,
7521                    query,
7522                    out,
7523                ),
7524            StrategyTag::BtUltra => self
7525                .collect_optimal_candidates_initialized::<strategy::BtUltra, true>(
7526                    abs_pos,
7527                    current_abs_end,
7528                    profile,
7529                    query,
7530                    out,
7531                ),
7532            StrategyTag::Btlazy2 => self
7533                .collect_optimal_candidates_initialized::<strategy::Btlazy2, true>(
7534                    abs_pos,
7535                    current_abs_end,
7536                    profile,
7537                    query,
7538                    out,
7539                ),
7540            StrategyTag::BtOpt => self
7541                .collect_optimal_candidates_initialized::<strategy::BtOpt, true>(
7542                    abs_pos,
7543                    current_abs_end,
7544                    profile,
7545                    query,
7546                    out,
7547                ),
7548            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
7549                self.collect_optimal_candidates_initialized::<strategy::Lazy, false>(
7550                    abs_pos,
7551                    current_abs_end,
7552                    profile,
7553                    query,
7554                    out,
7555                )
7556            }
7557        }
7558    }
7559
    /// Cross-platform entry. Picks the kernel-specific variant so the per-
    /// position pipeline (BT-tree fill, rep probing, hash3 probing, BT
    /// collect / HC chain walk) runs inside a single `target_feature`
    /// umbrella — all inner SIMD probes inline without ABI barriers.
    ///
    /// The on-encode hot path bypasses this dispatcher: `build_optimal_plan_impl_<kernel>`
    /// calls the matching `_<kernel>` variant directly. This entry is kept
    /// for the cfg(test)-only `collect_optimal_candidates` shim and any
    /// future caller that isn't already inside a kernel umbrella.
    ///
    /// Exactly one of the three `cfg`-gated tails below is compiled per
    /// target and becomes the function's tail expression:
    /// little-endian aarch64 -> NEON variant; x86/x86_64 -> runtime choice via
    /// `select_kernel()`; everything else (including wasm) -> scalar variant.
    #[allow(dead_code)]
    #[inline(always)]
    fn collect_optimal_candidates_initialized<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        // SAFETY: the callee only requires the `neon` target feature.
        // NOTE(review): this relies on NEON being available on every
        // little-endian aarch64 target the crate is built for — confirm.
        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
        unsafe {
            self.collect_optimal_candidates_initialized_neon::<S, USE_BT_MATCHFINDER>(
                abs_pos,
                current_abs_end,
                profile,
                query,
                out,
            )
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
            match select_kernel() {
                // SAFETY: the callee requires `avx2` + `bmi2`. Presumably
                // `select_kernel()` returns `Avx2Bmi2` only after detecting
                // both features on the running CPU — verify in `fastpath`.
                FastpathKernel::Avx2Bmi2 => unsafe {
                    self.collect_optimal_candidates_initialized_avx2_bmi2::<S, USE_BT_MATCHFINDER>(
                        abs_pos,
                        current_abs_end,
                        profile,
                        query,
                        out,
                    )
                },
                // SAFETY: the callee requires `sse4.2`. Presumably
                // `select_kernel()` returns `Sse42` only after detecting that
                // feature on the running CPU — verify in `fastpath`.
                FastpathKernel::Sse42 => unsafe {
                    self.collect_optimal_candidates_initialized_sse42::<S, USE_BT_MATCHFINDER>(
                        abs_pos,
                        current_abs_end,
                        profile,
                        query,
                        out,
                    )
                },
                // The scalar variant is a safe fn: no feature precondition.
                FastpathKernel::Scalar => self
                    .collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
                        abs_pos,
                        current_abs_end,
                        profile,
                        query,
                        out,
                    ),
            }
        }
        // Portable fallback for every other target. Unlike the
        // `build_optimal_plan_impl` dispatcher there is no wasm `simd128`
        // branch here, so wasm lands on the scalar variant as well.
        #[cfg(not(any(
            all(target_arch = "aarch64", target_endian = "little"),
            target_arch = "x86",
            target_arch = "x86_64"
        )))]
        {
            self.collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
                abs_pos,
                current_abs_end,
                profile,
                query,
                out,
            )
        }
    }
7639
    /// NEON-umbrella variant. Every inner helper (`bt_update_tree_until_neon`,
    /// `for_each_repcode_candidate_with_reps_neon`, `hash3_candidate_neon`,
    /// `bt_insert_and_collect_matches_neon`, `fastpath::neon::
    /// common_prefix_len_ptr`) shares the NEON umbrella so the per-position
    /// pipeline executes as a single straight-line inline sequence.
    ///
    /// The body is the expansion of
    /// `collect_optimal_candidates_initialized_body!`; this wrapper only
    /// supplies the NEON-specific callees. How `abs_pos`, `current_abs_end`,
    /// `profile`, `query` and `out` are used is defined by that macro.
    ///
    /// # Safety
    ///
    /// Compiled with `#[target_feature(enable = "neon")]`, so the caller must
    /// guarantee that the executing CPU supports NEON.
    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    #[target_feature(enable = "neon")]
    unsafe fn collect_optimal_candidates_initialized_neon<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific callees spliced into the shared body:
            bt_update_tree_until_neon,
            bt_insert_and_collect_matches_neon,
            for_each_repcode_candidate_with_reps_neon,
            hash3_candidate_neon,
            crate::encoding::fastpath::neon::common_prefix_len_ptr,
        )
    }
7674
    /// SSE4.2-umbrella variant of the per-position candidate collector.
    ///
    /// The body is the expansion of
    /// `collect_optimal_candidates_initialized_body!`; this wrapper only
    /// supplies the `_sse42` helpers and
    /// `fastpath::sse42::common_prefix_len_ptr` as callees. How the
    /// arguments are used is defined by that macro.
    ///
    /// # Safety
    ///
    /// Compiled with `#[target_feature(enable = "sse4.2")]`, so the caller
    /// must guarantee that the executing CPU supports SSE4.2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse4.2")]
    unsafe fn collect_optimal_candidates_initialized_sse42<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific callees spliced into the shared body:
            bt_update_tree_until_sse42,
            bt_insert_and_collect_matches_sse42,
            for_each_repcode_candidate_with_reps_sse42,
            hash3_candidate_sse42,
            crate::encoding::fastpath::sse42::common_prefix_len_ptr,
        )
    }
7704
    /// AVX2+BMI2-umbrella variant of the per-position candidate collector.
    ///
    /// The body is the expansion of
    /// `collect_optimal_candidates_initialized_body!`; this wrapper only
    /// supplies the `_avx2_bmi2` helpers and
    /// `fastpath::avx2_bmi2::common_prefix_len_ptr` as callees. How the
    /// arguments are used is defined by that macro.
    ///
    /// # Safety
    ///
    /// Compiled with `#[target_feature(enable = "avx2,bmi2")]`, so the caller
    /// must guarantee that the executing CPU supports both AVX2 and BMI2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2,bmi2")]
    unsafe fn collect_optimal_candidates_initialized_avx2_bmi2<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific callees spliced into the shared body:
            bt_update_tree_until_avx2_bmi2,
            bt_insert_and_collect_matches_avx2_bmi2,
            for_each_repcode_candidate_with_reps_avx2_bmi2,
            hash3_candidate_avx2_bmi2,
            crate::encoding::fastpath::avx2_bmi2::common_prefix_len_ptr,
        )
    }
7734
    /// Portable (no `target_feature`) variant of the per-position candidate
    /// collector.
    ///
    /// The body is the expansion of
    /// `collect_optimal_candidates_initialized_body!`; this wrapper only
    /// supplies the `_scalar` helpers and
    /// `fastpath::scalar::common_prefix_len_ptr` as callees. It is a safe
    /// `fn`, compiled on every target except little-endian aarch64.
    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
    // Macro emits `unsafe { }` wrappers for NEON/AVX/SSE variants; scalar
    // callees are safe so the blocks are redundant here only.
    #[allow(unused_unsafe)]
    fn collect_optimal_candidates_initialized_scalar<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific callees spliced into the shared body:
            bt_update_tree_until_scalar,
            bt_insert_and_collect_matches_scalar,
            for_each_repcode_candidate_with_reps_scalar,
            hash3_candidate_scalar,
            crate::encoding::fastpath::scalar::common_prefix_len_ptr,
        )
    }
7766}
7767
/// Legacy round-trip test for the removed `MatchGenerator` / `SuffixStore`
/// pair, kept for reference only.
///
/// `#[cfg(any())]` is never satisfied, so this function is stripped before
/// name resolution and never compiled or run; the identifiers it mentions
/// (`MatchGenerator`, `SuffixStore`) need not exist any more.
///
/// Shape of the test: blocks are appended with `add_data`, the emitted
/// sequences are replayed into `reconstructed`, and after each block the test
/// asserts that one further `next_sequence` call returns `false`. The final
/// check is that the replayed bytes equal everything that was fed in.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn matches() {
    let mut matcher = MatchGenerator::new(1000);
    let mut original_data = Vec::new();
    let mut reconstructed = Vec::new();

    // Minimal decoder: appends the literals, then copies `match_len` bytes
    // starting `offset` bytes back. The copy goes byte by byte so that a
    // match may overlap the bytes it is producing (offset < match_len).
    let replay_sequence = |seq: Sequence<'_>, reconstructed: &mut Vec<u8>| match seq {
        Sequence::Literals { literals } => {
            assert!(!literals.is_empty());
            reconstructed.extend_from_slice(literals);
        }
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            assert!(offset > 0);
            assert!(match_len >= MIN_MATCH_LEN);
            reconstructed.extend_from_slice(literals);
            assert!(offset <= reconstructed.len());
            let start = reconstructed.len() - offset;
            for i in 0..match_len {
                let byte = reconstructed[start + i];
                reconstructed.push(byte);
            }
        }
    };

    // Block 1: ten zero bytes.
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));

    assert!(!matcher.next_sequence(|_| {}));

    // Block 2: a 6-byte pattern repeated three times, then five zeros.
    matcher.add_data(
        alloc::vec![
            1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
        ],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[
        1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
    ]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 3: ascending run 1..=11 followed by five zeros.
    matcher.add_data(
        alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 4: five zeros.
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 5: the tail (7..=11) of the run from block 3.
    matcher.add_data(
        alloc::vec![7, 8, 9, 10, 11],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[7, 8, 9, 10, 11]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 6: added but skipped via `skip_matching`, so no sequence is
    // replayed for it; its bytes are appended to `reconstructed` by hand.
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    matcher.skip_matching();
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
    reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
    assert!(!matcher.next_sequence(|_| {}));

    // Block 7: the same bytes as the skipped block.
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 8: a 5-byte run (11, 13, 15, 17, 20) that repeats inside the
    // block itself.
    matcher.add_data(
        alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Everything replayed (plus the hand-appended skipped block) must equal
    // the concatenation of all input blocks.
    assert_eq!(reconstructed, original_data);
}
7888
/// Round-trips two identical 128 KiB blocks of a cycled 10-byte pattern
/// through `DfastMatchGenerator`, replaying the emitted sequences into one
/// growing history buffer. The first block must decode to itself; the bytes
/// appended while matching the second block must equal the second block.
#[test]
fn dfast_matches_roundtrip_multi_block_pattern() {
    const BLOCK_LEN: usize = 128 * 1024;

    /// Minimal decoder: append the literals, then (for a triple) copy
    /// `match_len` bytes from `offset` bytes back. The copy is byte-wise so
    /// a match may overlap the bytes it is producing.
    fn apply(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        let (literals, back_ref) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        decoded.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_ref {
            let start = decoded.len() - offset;
            for step in 0..match_len {
                let byte = decoded[start + step];
                decoded.push(byte);
            }
        }
    }

    let pattern: [u8; 10] = [9, 21, 44, 184, 19, 96, 171, 109, 141, 251];
    let make_block = || -> Vec<u8> { pattern.iter().copied().cycle().take(BLOCK_LEN).collect() };
    let first_block = make_block();
    let second_block = make_block();

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let mut history = Vec::new();

    // Block one: decodes from an empty history.
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| apply(&mut history, seq));
    assert_eq!(history, first_block);

    // Block two: decoding continues on top of the first block's output.
    matcher.add_data(second_block.clone(), |_| {});
    let prefix_len = history.len();
    matcher.start_matching(|seq| apply(&mut history, seq));

    assert_eq!(&history[prefix_len..], second_block.as_slice());
}
7923
/// Regression for the `DFAST_MIN_MATCH_LEN: 6 -> 5` drop. The fixture
/// is built so the longest available match is EXACTLY 5 bytes — a
/// matcher that still effectively requires a 6-byte floor would emit
/// only literals here and the assertion would catch the silent
/// 5-byte miss.
///
/// Fixture layout (55 B):
///   byte  0       `'Z'`      — lead byte, keeps the match source off
///                              position 0
///   bytes 1..6    `"ABCDE"`  — match source
///   bytes 6..29   `'!'` × 23 — filler that does NOT start with 'A'
///   bytes 29..34  `"ABCDE"`  — match site (repeats the source)
///   byte  34      `'F'`      — terminator: differs from byte 6 (`'!'`),
///                              so the forward extension at the match
///                              site stops at exactly length 5.
///   bytes 35..55  `"GHIJKLMNOPQRSTUVWXYZ"` — trailing filler
///
/// A 5-byte match at offset 28 (29 - 1) must be emitted; a 6-byte+ match at
/// the same offset must NOT.
#[test]
fn dfast_accepts_exact_five_byte_match() {
    // Layout the input so that:
    //   byte  0      = 'Z'            (lead byte — keeps the match SOURCE off
    //                                  position 0, which the greedy loop never
    //                                  inserts: like the upstream zstd it starts the
    //                                  cursor at ip+1 and hashes only visited
    //                                  positions)
    //   bytes 1..6   = "ABCDE"        (the match source — position 1 IS visited)
    //   bytes 6..29  = 23 filler bytes that do NOT start with 'A'
    //   bytes 29..34 = "ABCDE"        (the 5-byte match site)
    //   byte  34     = 'F'            (differs from byte 6 = '!')
    // The longest available copy at position 29 is exactly 5 bytes:
    // the byte at position 34 ('F') differs from the byte at position 6
    // ('!'), so the forward extension stops at length 5.
    let mut data = Vec::new();
    data.push(b'Z'); // 0
    data.extend_from_slice(b"ABCDE"); // 1..6
    data.extend_from_slice(b"!!!!!!!!!!!!!!!!!!!!!!!"); // 6..29 (23 bytes)
    data.extend_from_slice(b"ABCDE"); // 29..34
    data.push(b'F'); // 34: forces forward extension to stop at length 5
    // Trailing filler so the match site (29) sits at least HASH_READ_SIZE (8)
    // bytes before the block end. The greedy double-fast — like the upstream zstd —
    // stops probing at `ilimit = iend - HASH_READ_SIZE`, so a match in the
    // final 8 bytes is never searched (upstream zstd parity, not a regression).
    data.extend_from_slice(b"GHIJKLMNOPQRSTUVWXYZ"); // 35..55
    assert_eq!(data.len(), 55);

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    // `data` is not needed afterwards, so hand it over without cloning.
    matcher.add_data(data, |_| {});

    let mut saw_five_byte_match = false;
    let mut saw_longer_match = false;
    matcher.start_matching(|seq| {
        if let Sequence::Triple {
            offset, match_len, ..
        } = seq
        {
            // Only matches at the fixture's offset (site 29 - source 1 = 28)
            // are classified; anything else is ignored.
            if offset == 28 && match_len == 5 {
                saw_five_byte_match = true;
            } else if offset == 28 && match_len > 5 {
                saw_longer_match = true;
            }
        }
    });

    assert!(
        saw_five_byte_match,
        "dfast must accept the exact-5-byte match — a 6-byte floor would skip it"
    );
    assert!(
        !saw_longer_match,
        "fixture pinned to length 5 — byte 34 ('F') must terminate the extension"
    );
}
7995
/// Walks a driver through `reset(Default)` -> feed/skip one slice -> feed and
/// match an identical slice -> `reset(Fastest)`, checking the selected
/// backend, the reported window sizes, and that the second slice decodes
/// correctly on top of the skipped first one.
#[test]
fn driver_switches_backends_and_initializes_dfast_via_reset() {
    use super::strategy::BackendTag;

    const CHUNK: &[u8; 12] = b"abcabcabcabc";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Default);
    assert_eq!(driver.active_backend(), BackendTag::Dfast);
    assert_eq!(driver.window_size(), 1u64 << 21);

    // Slice one: committed, then skipped instead of matched.
    let mut skipped = driver.get_next_space();
    skipped[..CHUNK.len()].copy_from_slice(CHUNK);
    skipped.truncate(CHUNK.len());
    driver.commit_space(skipped);
    assert_eq!(driver.get_last_space(), CHUNK);
    driver.skip_matching_with_hint(None);

    // Slice two: same bytes, this time run through the matcher.
    let mut matched = driver.get_next_space();
    matched[..CHUNK.len()].copy_from_slice(CHUNK);
    matched.truncate(CHUNK.len());
    driver.commit_space(matched);

    // The decoder starts from the skipped slice's bytes so that any
    // back-reference into it can be resolved.
    let mut reconstructed = CHUNK.to_vec();
    driver.start_matching(|seq| {
        let (literals, back_ref) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        reconstructed.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_ref {
            let start = reconstructed.len() - offset;
            for step in 0..match_len {
                let byte = reconstructed[start + step];
                reconstructed.push(byte);
            }
        }
    });
    assert_eq!(reconstructed, b"abcabcabcabcabcabcabcabc");

    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), 1u64 << 19);
}
8037
/// `CompressionLevel::Level(5)` must select the Row backend *and* the greedy
/// parse.
#[test]
fn driver_level5_selects_row_backend() {
    use super::strategy::{BackendTag, ParseMode};

    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.reset(CompressionLevel::Level(5));

    let backend = driver.active_backend();
    assert_eq!(backend, BackendTag::Row);

    // `MatchGeneratorDriver::start_matching` sends the Row backend down
    // `start_matching_greedy` iff `self.parse == ParseMode::Greedy`, so that
    // selector is what gets asserted: a round-trip alone would pass on the
    // lazy parser too.
    assert_eq!(
        driver.parse,
        ParseMode::Greedy,
        "L5 must route to start_matching_greedy (parse == Greedy)",
    );

    // Secondary corroboration of the same routing decision: the row
    // matcher's `lazy_depth` mirrors the parse mode. Checking `parse`
    // directly above still catches a regression should the two ever drift
    // apart.
    let lazy_depth = driver.row_matcher().lazy_depth;
    assert_eq!(
        lazy_depth,
        0,
        "row matcher lazy_depth must mirror the greedy parse mode",
    );
}
8061
/// Level 4 maps to `StrategyTag::Dfast` (the greedy double-fast, upstream zstd
/// `ZSTD_dfast`; "greedy" here names the parse discipline, not the
/// Row/Greedy strategy used at Level 5).
///
/// A correct round-trip does not by itself pin match quality, because a lazy
/// parser would reconstruct the input just as well. The test therefore also
/// requires at least one `Sequence::Triple` for a small repeating pattern, so
/// a regression that degrades to literals-only output (e.g. a `min_match` or
/// rep-probe guard regression) is caught.
#[test]
fn driver_level4_greedy_round_trip_single_slice() {
    let input = b"abcdefgh_abcdefgh_abcdefgh_abcdefgh";

    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    let mut space = driver.get_next_space();
    space[..input.len()].copy_from_slice(input);
    space.truncate(input.len());
    driver.commit_space(space);

    let mut saw_triple = false;
    let mut reconstructed: Vec<u8> = Vec::new();
    driver.start_matching(|seq| {
        let (literals, back_ref) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        reconstructed.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_ref {
            saw_triple = true;
            // Byte-wise copy so a match may overlap its own output.
            let start = reconstructed.len() - offset;
            for step in 0..match_len {
                let byte = reconstructed[start + step];
                reconstructed.push(byte);
            }
        }
    });

    assert_eq!(
        reconstructed,
        input.to_vec(),
        "L4 greedy parse failed to reconstruct repeating-pattern input",
    );
    assert!(
        saw_triple,
        "L4 greedy parse on a repeating pattern must emit at least one match (Triple)",
    );
}
8107
/// Commits the same 32-byte chunk as two consecutive slices and checks that
/// the second parse pass both reconstructs the data and emits at least one
/// match reaching back into the first slice.
#[test]
fn driver_level4_greedy_round_trip_cross_slice() {
    /// Copy `bytes` into the driver's next free slice and commit it.
    fn stage(driver: &mut MatchGeneratorDriver, bytes: &[u8]) {
        let mut space = driver.get_next_space();
        space.truncate(bytes.len());
        space.copy_from_slice(bytes);
        driver.commit_space(space);
    }

    /// Decoder stand-in: append `literals`, then copy `match_len` bytes
    /// starting `offset` bytes behind the end of `out`.
    fn replay(out: &mut Vec<u8>, literals: &[u8], offset: usize, match_len: usize) {
        out.extend_from_slice(literals);
        let from = out.len() - offset;
        for src in from..from + match_len {
            let byte = out[src];
            out.push(byte);
        }
    }

    // The greedy parse has to carry repcode / hash-table state across slice
    // boundaries: the second slice repeats the first byte for byte, so its
    // matches must come from the previous slice's history.
    let mut driver = MatchGeneratorDriver::new(32, 4);
    driver.reset(CompressionLevel::Level(4));
    let chunk = b"the quick brown fox jumps over!!";
    assert_eq!(chunk.len(), 32);

    // Pass 1: the first slice on its own must round-trip.
    stage(&mut driver, chunk);
    let mut first_pass: Vec<u8> = Vec::new();
    driver.start_matching(|seq| match seq {
        Sequence::Literals { literals } => first_pass.extend_from_slice(literals),
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => replay(&mut first_pass, literals, offset, match_len),
    });
    assert_eq!(
        first_pass,
        chunk.to_vec(),
        "first slice failed to round-trip"
    );

    // Pass 2: identical bytes again, decoded on top of the first pass output.
    stage(&mut driver, chunk);
    let mut combined = first_pass.clone();
    let mut reached_first_slice = false;
    driver.start_matching(|seq| match seq {
        Sequence::Literals { literals } => combined.extend_from_slice(literals),
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            // From any position inside the second slice, an offset of at
            // least one full chunk lands in the first slice — exactly the
            // cross-slice behavior under test.
            if offset >= chunk.len() {
                reached_first_slice = true;
            }
            replay(&mut combined, literals, offset, match_len);
        }
    });

    let expected = chunk.repeat(2);
    assert_eq!(
        combined, expected,
        "cross-slice L4 greedy parse failed to reconstruct"
    );
    assert!(
        reached_first_slice,
        "L4 greedy parse must match across slice boundaries (history is shared)",
    );
}
8185
/// Test-only extensions of `MatchGeneratorDriver` that let unit tests steer
/// the parse×search pairing a level is routed onto.
#[cfg(test)]
impl MatchGeneratorDriver {
    /// Test-only: arm a `(search, parse)` recipe override by storing it in
    /// `config_override`; it is meant to be picked up by the next `reset()`.
    /// This routes a level through a non-default pairing so the decoupling
    /// of the two axes can be exercised end-to-end.
    pub(crate) fn set_config_override(
        &mut self,
        search: super::strategy::SearchMethod,
        parse: super::strategy::ParseMode,
    ) {
        let staged = (search, parse);
        self.config_override = Some(staged);
    }

    /// Test-only: reset to `level` with the lazy HashChain pairing
    /// (`SearchMethod::HashChain` + `ParseMode::Lazy2`) armed beforehand.
    ///
    /// In production the lazy band runs on the Row backend, so HC-specific
    /// behaviour (live-chain dict prime, eviction budget accounting, seed
    /// pass gates) is reached through this override-backed reset.
    pub(crate) fn reset_on_hc_lazy(&mut self, level: CompressionLevel) {
        use super::strategy::{ParseMode, SearchMethod};

        self.set_config_override(SearchMethod::HashChain, ParseMode::Lazy2);
        self.reset(level);
    }
}
8214
/// Runs a complete compress parse over `data` at `level` and rebuilds the
/// bytes from the emitted sequences.
///
/// When `over` is `Some`, the given (search, parse) pair is armed on the
/// driver before the reset. The payload is fed slice by slice, each slice
/// being parsed right after it is committed. For a correct parse the returned
/// buffer equals `data`.
#[cfg(test)]
fn drive_roundtrip_with_override(
    level: CompressionLevel,
    over: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    data: &[u8],
) -> Vec<u8> {
    /// Decoder stand-in: append `literals`, then copy `match_len` bytes
    /// starting `offset` bytes behind the end of `out`.
    fn replay(out: &mut Vec<u8>, literals: &[u8], offset: usize, match_len: usize) {
        out.extend_from_slice(literals);
        let from = out.len() - offset;
        for src in from..from + match_len {
            let byte = out[src];
            out.push(byte);
        }
    }

    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);
    if let Some((search, parse)) = over {
        driver.set_config_override(search, parse);
    }
    driver.reset(level);

    let mut rebuilt: Vec<u8> = Vec::with_capacity(data.len());
    let mut pending = data;
    while !pending.is_empty() {
        // Fill the next slice with as much of the remaining payload as fits.
        let mut space = driver.get_next_space();
        let take = pending.len().min(space.len());
        let (head, rest) = pending.split_at(take);
        space.truncate(take);
        space.copy_from_slice(head);
        driver.commit_space(space);
        pending = rest;

        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => rebuilt.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => replay(&mut rebuilt, literals, offset, match_len),
        });
    }
    rebuilt
}
8258
/// Phase 1 capability proof: parse and search are decoupled, so a level can
/// run any parse mode on any non-opt search backend. Greedy-on-HashChain and
/// Lazy2-on-RowHash are pairings the legacy `strategy_tag` could not express;
/// every pairing below must reconstruct the input exactly.
#[test]
fn parse_search_matrix_decoupled_roundtrips() {
    use super::strategy::{ParseMode, SearchMethod};

    // Mixed repetitive + literal payload that exercises matches and reps:
    // a fixed phrase followed by a little-endian counter, 4000 times.
    let data: Vec<u8> = (0..4000u32)
        .flat_map(|i| {
            b"the quick brown fox "
                .iter()
                .copied()
                .chain(i.to_le_bytes())
        })
        .collect();

    let pairings = [
        // Greedy parse on the HashChain search backend (legacy: Greedy was
        // welded to RowHash).
        (
            5,
            SearchMethod::HashChain,
            ParseMode::Greedy,
            "greedy-on-hashchain diverged",
        ),
        // Lazy2 parse on the RowHash search backend (legacy: Lazy was welded
        // to HashChain).
        (
            8,
            SearchMethod::RowHash,
            ParseMode::Lazy2,
            "lazy2-on-rowhash diverged",
        ),
        // Lazy on RowHash too (depth 1).
        (
            6,
            SearchMethod::RowHash,
            ParseMode::Lazy,
            "lazy-on-rowhash diverged",
        ),
    ];

    for (level, search, parse, failure) in pairings {
        let got = drive_roundtrip_with_override(
            CompressionLevel::Level(level),
            Some((search, parse)),
            &data,
        );
        assert_eq!(got, data, "{failure}");
    }
}
8299
/// The row `mls` knob (C-like `minMatch`) is honoured: for every value in
/// 4..=7 each emitted match (regular row + repcode, on the lazy parse) is at
/// least `mls` bytes long and the stream still round-trips. The default (5)
/// reproduces the historical `ROW_MIN_MATCH_LEN` behaviour.
#[test]
fn row_mls_knob_gates_matches_and_roundtrips() {
    /// Decoder stand-in: append `literals`, then copy `match_len` bytes
    /// starting `offset` bytes behind the end of `out`.
    fn replay(out: &mut Vec<u8>, literals: &[u8], offset: usize, match_len: usize) {
        out.extend_from_slice(literals);
        let from = out.len() - offset;
        for src in from..from + match_len {
            let byte = out[src];
            out.push(byte);
        }
    }

    // 4000 records of an 8-byte constant prefix plus a 4-byte LE counter.
    let mut data: Vec<u8> = Vec::with_capacity(4000 * 12);
    for i in 0..4000u32 {
        data.extend_from_slice(b"abcdefgh");
        data.extend_from_slice(&i.to_le_bytes());
    }

    for mls in 4usize..=7 {
        let mut cfg = ROW_CONFIG;
        cfg.mls = mls;
        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(cfg);
        matcher.add_data(data.clone(), |_| {});

        let mut out: Vec<u8> = Vec::with_capacity(data.len());
        // `None` until the first match is seen.
        let mut shortest: Option<usize> = None;
        matcher.start_matching(|seq| match seq {
            Sequence::Literals { literals } => out.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                shortest = Some(shortest.map_or(match_len, |len| len.min(match_len)));
                replay(&mut out, literals, offset, match_len);
            }
        });

        assert_eq!(out, data, "mls={mls} round-trip diverged");
        if let Some(shortest_match) = shortest {
            assert!(
                shortest_match >= mls,
                "mls={mls}: emitted a {shortest_match}-byte match below the floor",
            );
        }
    }
}
8349
/// `LevelParams::parse()` derives the parse mode from the `search` axis and
/// not from the strategy tag, so the decoupling holds even when a `Bt*`-tagged
/// level is overridden onto a non-BT search backend. Before the fix the method
/// matched on `strategy_tag` and answered `Optimal` for every `Bt*` tag,
/// whatever `search` / `lazy_depth` said.
#[test]
fn parse_mode_follows_search_axis_not_strategy_tag() {
    use super::strategy::{ParseMode, SearchMethod};

    // LEVEL_TABLE[15] is level 16: BtOpt tag, BinaryTree search.
    let mut params = LEVEL_TABLE[15];
    assert_eq!(
        params.parse(),
        ParseMode::Optimal,
        "BinaryTree search → Optimal"
    );

    // Move the Bt-tagged level onto a non-BT backend: the parse mode must now
    // be derived from `lazy_depth` instead of staying Optimal.
    params.search = SearchMethod::RowHash;
    let depth_cases = [
        (0, ParseMode::Greedy, "RowHash + depth 0 → Greedy"),
        (2, ParseMode::Lazy2, "RowHash + depth 2 → Lazy2"),
    ];
    for (depth, expected, failure) in depth_cases {
        params.lazy_depth = depth;
        assert_eq!(params.parse(), expected, "{failure}");
    }
}
8369
/// The test-only `config_override` is one-shot: the first `reset()` consumes
/// it, so a reused driver does not silently keep the synthetic pairing armed
/// for later resets. Before the fix `reset()` copied the override and left it
/// in place.
#[test]
fn config_override_is_consumed_by_reset() {
    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);

    // Arm the override and confirm it is staged.
    driver.set_config_override(
        super::strategy::SearchMethod::RowHash,
        super::strategy::ParseMode::Lazy2,
    );
    assert!(driver.config_override.is_some());

    // A single reset must clear it again.
    driver.reset(CompressionLevel::Level(5));
    assert!(
        driver.config_override.is_none(),
        "override must be consumed after one reset",
    );
}
8386
/// Helper: round-trips `data` through the Level 4 parse and asserts that the
/// reconstructed bytes equal the input. Returns `(triple_count, max_offset)`
/// so callers can probe the parse shape (number of matches emitted and the
/// largest offset seen).
///
/// Level 4 maps to the greedy Dfast (double-fast) backend — "greedy" here is
/// the parse discipline (no lazy lookahead, upstream zstd `ZSTD_dfast`), NOT
/// the Row/Greedy strategy (which is Level 5). This round-trip is intentional
/// Dfast L4 coverage; the Row backend is exercised by the `Level(5)` fixtures
/// elsewhere in this file.
#[cfg(test)]
fn l4_greedy_round_trip(slice_size: usize, max_slices: usize, data: &[u8]) -> (usize, usize) {
    /// Decoder stand-in: append `literals`, then copy `match_len` bytes
    /// starting `offset` bytes behind the end of `out`.
    fn replay(out: &mut Vec<u8>, literals: &[u8], offset: usize, match_len: usize) {
        out.extend_from_slice(literals);
        let from = out.len() - offset;
        for src in from..from + match_len {
            let byte = out[src];
            out.push(byte);
        }
    }

    let mut driver = MatchGeneratorDriver::new(slice_size, max_slices);
    driver.reset(CompressionLevel::Level(4));

    let mut rebuilt: Vec<u8> = Vec::with_capacity(data.len());
    let mut triple_count = 0usize;
    let mut max_offset = 0usize;

    // `start_matching` consumes the current pending slice, so a multi-slice
    // payload is committed and driven one slice at a time; that way earlier
    // slices' bytes round-trip out before they are displaced from the window.
    let mut pending = data;
    while !pending.is_empty() {
        let mut space = driver.get_next_space();
        let take = pending.len().min(space.len());
        let (head, rest) = pending.split_at(take);
        space.truncate(take);
        space.copy_from_slice(head);
        driver.commit_space(space);
        pending = rest;

        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => rebuilt.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                triple_count += 1;
                max_offset = max_offset.max(offset);
                replay(&mut rebuilt, literals, offset, match_len);
            }
        });
    }

    // An empty payload still gets one commit/drive round so the empty-input
    // path of `start_matching_greedy` (the `current_len == 0` early-return
    // guard) is exercised.
    if data.is_empty() {
        let mut space = driver.get_next_space();
        space.truncate(0);
        driver.commit_space(space);
        driver.start_matching(|seq| match seq {
            Sequence::Literals { literals } => rebuilt.extend_from_slice(literals),
            Sequence::Triple { .. } => panic!("empty input must not emit any matches"),
        });
    }

    assert_eq!(rebuilt, data, "L4 greedy round-trip diverged");
    (triple_count, max_offset)
}
8451
/// Tail rep-only case (originally flagged in review): the previous outer-loop
/// guard `pos + ROW_MIN_MATCH_LEN <= current_len` (6) left the last 5-byte
/// position unreachable. The rep probe at `abs_pos + 1` only needs 4 bytes of
/// lookahead beyond the probe point, so the guard was relaxed to
/// `pos + GREEDY_MIN_LOOKAHEAD <= current_len` (5).
///
/// The two slices are driven separately and the assertion counts matches
/// emitted **from the second slice's parse pass** only. A regression that
/// re-tightens the guard or breaks the cross-slice repcode lookup therefore
/// fails instead of being masked by first-slice matches.
#[test]
fn driver_level5_greedy_tail_rep_only_reachable() {
    /// Copy `bytes` into the driver's next free slice and commit it.
    fn stage(driver: &mut MatchGeneratorDriver, bytes: &[u8]) {
        let mut space = driver.get_next_space();
        space.truncate(bytes.len());
        space.copy_from_slice(bytes);
        driver.commit_space(space);
    }

    // The period-4 first slice locks rep1 = 4 into `offset_hist` by the time
    // the parse reaches the slice tail. The second slice is exactly 5 bytes
    // (= `GREEDY_MIN_LOOKAHEAD`), so the outer loop runs **once** at
    // `pos = 0`. The regular `row_candidate` requires 6 bytes from `abs_pos`,
    // which is past the live history, so the only viable hit is the
    // `abs_pos + 1` rep probe. `second` is shaped so that probe finds a
    // 4-byte match at offset 4 (`second[1..5] == first[13..16] ++ second[0]
    // == "BCDA"`); `extend_backwards_shared` then absorbs `second[0]` into
    // the match (one byte back into the implicit anchor, and no further
    // because the anchor itself is the current `abs_pos`).
    let first: &[u8] = b"ABCDABCDABCDABCD"; // 16 bytes — strict period 4
    let second: &[u8] = b"ABCDA"; // 5 bytes — exact GREEDY_MIN_LOOKAHEAD

    let mut driver = MatchGeneratorDriver::new(16, 2);
    driver.reset(CompressionLevel::Level(5));

    // First pass: output is irrelevant, it only primes the history.
    stage(&mut driver, first);
    driver.start_matching(|_| {});

    // Second pass: count the matches emitted for the 5-byte tail slice.
    stage(&mut driver, second);
    let mut second_slice_triples = 0usize;
    driver.start_matching(|seq| {
        if let Sequence::Triple { .. } = seq {
            second_slice_triples += 1;
        }
    });

    assert!(
        second_slice_triples >= 1,
        "tail rep-only position must produce a match in the second slice \
         (got {second_slice_triples} triples)",
    );
}
8505
/// Empty input: a single zero-length slice is committed and parsed; the
/// parse must emit no sequence at all and must not panic. Exercises the
/// `current_len == 0` early-return guard at the top of
/// `start_matching_greedy`.
#[test]
fn driver_level4_greedy_empty_input_emits_nothing() {
    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    // Commit an empty space so the matcher has SOMETHING to start matching
    // on (otherwise `start_matching` panics on the `window.back()` unwrap —
    // that's a separate path covered by existing reset tests).
    let mut empty = driver.get_next_space();
    empty.truncate(0);
    driver.commit_space(empty);

    let mut emitted = 0usize;
    driver.start_matching(|_| emitted += 1);
    assert!(emitted == 0, "empty slice must not emit any sequences",);
}
8524
/// Input shorter than `GREEDY_MIN_LOOKAHEAD = 5`: the outer loop never runs a
/// body iteration, yet the tail literal path must still emit the input bytes
/// as a single `Sequence::Literals`. The round-trip itself is asserted inside
/// `l4_greedy_round_trip`; this test additionally pins "no matches".
#[test]
fn driver_level4_greedy_sub_min_lookahead_input() {
    // 4 bytes — one short of the minimum lookahead.
    let triples = l4_greedy_round_trip(64, 2, b"abcd").0;
    assert_eq!(
        triples, 0,
        "sub-min-lookahead input must not emit any matches (got {triples})",
    );
}
8537
/// Pseudo-random bytes with no exploitable structure — every position is a
/// "miss" in both the rep probe and the row candidate. Exercises the miss
/// branch plus the `SKIP_STRENGTH = 10` skip-step grow (irrelevant at this
/// size, but the path runs).
#[test]
fn driver_level4_greedy_incompressible_input() {
    // 256 bytes from a linear congruential generator; bits 16..24 of each
    // state become one output byte.
    let data: Vec<u8> = (0..256)
        .scan(0xDEAD_BEEF_u32, |state, _| {
            *state = (*state).wrapping_mul(1_103_515_245).wrapping_add(12345);
            Some((*state >> 16) as u8)
        })
        .collect();

    // No structural assertion — the test passes when the round-trip inside
    // the helper is bit-exact and no panic / debug_assert fires.
    let _ = l4_greedy_round_trip(64, 8, &data);
}
8554
/// Stress smoke for long incompressible runs.
///
/// 2 KiB of unstructured bytes pushes the literal-run length past the
/// `SKIP_STRENGTH = 10` threshold (~1 KiB), so the miss branch and the
/// per-miss step-grow path in `start_matching_greedy` are exercised. The
/// test only checks the bit-exact round-trip and the absence of panics /
/// `debug_assert!` failures. It does NOT pin the `SKIP_STRENGTH` constant or
/// the per-iteration step count: the round-trip would pass just as well with
/// `SKIP_STRENGTH = 6` or `= 14`, since both produce valid sequences.
/// Pinning the exact step growth would mean returning step / iteration
/// metadata from the parse — invasive plumbing for a constant that has not
/// been re-tuned in months. The value here is catching panics or correctness
/// regressions on long incompressible runs.
#[test]
fn driver_level4_greedy_long_literal_run_skip_step_growth() {
    // 2048 bytes from a multiplicative generator; the top byte of each state
    // becomes one output byte.
    let data: Vec<u8> = (0..2048)
        .scan(0xC0FF_EE00_u32, |state, _| {
            *state = (*state).wrapping_mul(0x9E37_79B9).wrapping_add(0xCAFEBABE);
            Some((*state >> 24) as u8)
        })
        .collect();

    let _ = l4_greedy_round_trip(512, 8, &data);
}
8579
/// All zeros: every position after the first byte satisfies
/// `byte[pos] == byte[pos - 1]`, so the rep1 probe at `abs_pos + 1` hits
/// immediately and the parse collapses to a single long match. Exercises the
/// `cheap rep at +1, full-match length` path.
#[test]
fn driver_level4_greedy_all_zeros_heavy_rep1() {
    let zeros = [0u8; 128];
    let (triples, max_offset) = l4_greedy_round_trip(64, 8, &zeros);

    assert!(
        triples >= 1,
        "all-zeros input must produce at least one rep1 match",
    );
    // Every byte equals its predecessor, so the dominant match should
    // reference rep1 (offset 1). A larger offset would indicate that the
    // rep1 probe was bypassed.
    assert_eq!(
        max_offset, 1,
        "all-zeros L4 greedy parse should commit at offset 1 (got {max_offset})",
    );
}
8600
/// A periodic payload covers the steady-state rep-cascade path of the greedy
/// parse: once the period is locked into `offset_hist[0]`, the main-loop rep
/// probe at `abs_pos + 1` fires on every iteration and the parse emits a long
/// chain of triples at the same offset.
#[test]
fn driver_level4_greedy_periodic_pattern_rep_cascade() {
    let unit: &[u8] = b"alpha_beta_gamma";
    assert_eq!(unit.len(), 16);

    // 32 back-to-back copies of the 16-byte unit.
    let data: Vec<u8> = unit.repeat(32);
    let (triples, max_offset) = l4_greedy_round_trip(64, 16, &data);

    assert!(
        triples >= 1,
        "periodic 16-byte payload must emit matches (got {triples})",
    );
    assert!(
        max_offset >= 16,
        "periodic 16-byte payload must produce at least one offset >= 16 \
         (got max_offset = {max_offset})",
    );
}
8625
/// After `reset(level)` the driver's `strategy_tag` must be the tag expected
/// for that level, and the backend implied by the tag must agree with the
/// backend the driver reports as active.
#[test]
fn driver_reset_keeps_strategy_tag_in_sync_with_active_backend() {
    use super::strategy::StrategyTag;

    let expectations = [
        (CompressionLevel::Level(1), StrategyTag::Fast),
        (CompressionLevel::Level(2), StrategyTag::Fast),
        (CompressionLevel::Level(3), StrategyTag::Dfast),
        (CompressionLevel::Level(4), StrategyTag::Dfast),
        (CompressionLevel::Level(5), StrategyTag::Greedy),
        (CompressionLevel::Level(7), StrategyTag::Lazy),
        (CompressionLevel::Level(12), StrategyTag::Lazy),
        (CompressionLevel::Level(13), StrategyTag::Btlazy2),
        (CompressionLevel::Level(14), StrategyTag::Btlazy2),
        (CompressionLevel::Level(15), StrategyTag::Btlazy2),
        (CompressionLevel::Level(16), StrategyTag::BtOpt),
        (CompressionLevel::Level(18), StrategyTag::BtUltra),
        (CompressionLevel::Level(22), StrategyTag::BtUltra2),
        (CompressionLevel::Fastest, StrategyTag::Fast),
        (CompressionLevel::Default, StrategyTag::Dfast),
        (CompressionLevel::Better, StrategyTag::Lazy),
        // `Best` sits on level 13 (the first dominant point of the deep band).
        (CompressionLevel::Best, StrategyTag::Btlazy2),
    ];

    for (level, expected) in expectations {
        // A fresh driver per level keeps the cases independent.
        let mut driver = MatchGeneratorDriver::new(32, 2);
        driver.reset(level);
        assert_eq!(
            driver.strategy_tag, expected,
            "strategy_tag wrong for {level:?}"
        );
        assert_eq!(
            driver.strategy_tag.backend(),
            driver.active_backend(),
            "strategy_tag backend disagrees with active_backend for {level:?}"
        );
    }
}
8663
/// Levels 16 and 17 resolve to the HashChain backend tag and carry the
/// `BtOpt` strategy tag.
#[test]
fn level_16_17_map_to_btopt_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    let levels = [16u8, 17];
    // Backend first for both levels, then the strategy tag.
    for level in levels {
        let params = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(params.backend(), BackendTag::HashChain);
    }
    for level in levels {
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtOpt);
    }
}
8674
/// Upstream zstd `clevels.h` (srcSize > 256 KiB tier) has level 18 =
/// `ZSTD_btultra` and level 19 = `ZSTD_btultra2`. Level 19 used to be mapped
/// to plain btultra, which under-searched (searchLog 6 vs 7) and lost ~3.7%
/// ratio on the repo corpus; this test pins the corrected mapping.
#[test]
fn level_18_maps_to_btultra_level_19_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    let backend_of =
        |level: i32| resolve_level_params(CompressionLevel::Level(level), None).backend();

    assert_eq!(backend_of(18), BackendTag::HashChain);
    assert_eq!(backend_of(19), BackendTag::HashChain);
    assert_eq!(StrategyTag::for_level(18), StrategyTag::BtUltra);
    assert_eq!(StrategyTag::for_level(19), StrategyTag::BtUltra2);
}
8689
/// Levels 20 through 22 resolve to the HashChain backend tag and carry the
/// `BtUltra2` strategy tag.
#[test]
fn level_20_22_map_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    (20u8..=22).for_each(|level| {
        let params = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(params.backend(), BackendTag::HashChain);
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtUltra2);
    });
}
8699
/// Without a source-size hint, level 22 resolves to window_log 27 and the
/// large-input HC geometry (hash_log 25, chain_log 27, search depth 1 << 9,
/// target_len 999).
#[test]
fn level22_uses_target_length_and_large_input_tables() {
    let params = resolve_level_params(CompressionLevel::Level(22), None);
    assert_eq!(params.window_log, 27);

    let hc = params.hc.unwrap();
    let checks = [
        (hc.hash_log, 25),
        (hc.chain_log, 27),
        (hc.search_depth, 1 << 9),
        (hc.target_len, 999),
    ];
    for (actual, expected) in checks {
        assert_eq!(actual, expected);
    }
}
8710
/// Pins the BT-level (window_log, hash_log, chain_log, search_depth,
/// target_len) tuples so the clevels.h alignment cannot silently drift.
///
/// Levels 16-20 mirror upstream `clevels.h` (srcSize > 256 KiB tier,
/// search_depth = 1 << searchLog). Level 21 intentionally keeps a deeper
/// search_depth (512 vs upstream's 128): it beats C on ratio there and the
/// deeper walk is a deliberate ratio-positive divergence.
#[test]
fn bt_levels_16_to_21_pin_clevels_params() {
    // (level, window_log, hash_log, chain_log, search_depth, target_len)
    let pinned: [(u8, u8, usize, usize, usize, usize); 6] = [
        (16, 22, 22, 22, 32, 48),
        (17, 23, 22, 23, 32, 64),
        (18, 23, 22, 23, 64, 64),
        (19, 23, 22, 24, 128, 256),
        (20, 25, 23, 25, 128, 256),
        (21, 26, 24, 24, 512, 256),
    ];

    for (level, window_log, hash_log, chain_log, search_depth, target_len) in pinned {
        let params = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(params.window_log, window_log, "level {level} window_log");

        let hc = params.hc.unwrap();
        assert_eq!(hc.hash_log, hash_log, "level {level} hash_log");
        assert_eq!(hc.chain_log, chain_log, "level {level} chain_log");
        assert_eq!(hc.search_depth, search_depth, "level {level} search_depth");
        assert_eq!(hc.target_len, target_len, "level {level} target_len");
    }
}
8738
/// With a source-size hint, level 22 resolves to a smaller btultra2 geometry
/// per size tier (16 KiB, 128 KiB, 256 KiB); `target_len` stays 999 in every
/// tier.
#[test]
fn level22_source_size_hint_uses_btultra2_tiers() {
    // (source size hint, window_log, hash_log, chain_log, search_depth)
    let tiers: [(u64, u8, usize, usize, usize); 3] = [
        (16 * 1024, 14, 15, 15, 1 << 10),
        (128 * 1024, 17, 17, 18, 1 << 11),
        (256 * 1024, 18, 19, 19, 1 << 13),
    ];

    for (hint, window_log, hash_log, chain_log, search_depth) in tiers {
        let params = resolve_level_params(CompressionLevel::Level(22), Some(hint));
        assert_eq!(params.window_log, window_log);

        let hc = params.hc.unwrap();
        assert_eq!(hc.hash_log, hash_log);
        assert_eq!(hc.chain_log, chain_log);
        assert_eq!(hc.search_depth, search_depth);
        assert_eq!(hc.target_len, 999);
    }
}
8765
/// srcSize 15 027 (<= 16 KB) selects the table[3] btultra2 row, and the
/// source-size clamp yields windowLog 14 (ceil log2 15027). This is a
/// pure-Rust assertion against the constant tier-3 geometry (no FFI).
#[test]
fn level22_non_power_of_two_small_source_uses_tier3_params() {
    const SOURCE_SIZE: u64 = 15_027;

    let params = resolve_level_params(CompressionLevel::Level(22), Some(SOURCE_SIZE));
    let hc = params.hc.unwrap();

    assert_eq!(params.window_log, 14);
    let geometry = [
        (hc.chain_log, 15),
        (hc.hash_log, 15),
        (hc.search_depth, 1 << 10),
    ];
    for (actual, expected) in geometry {
        assert_eq!(actual, expected);
    }
    assert_eq!(HC_OPT_MIN_MATCH_LEN, 3);
    assert_eq!(hc.target_len, 999);
}
8782
/// Configuring the HC generator with the 16K level-22 config and a window
/// log of 14 must leave `hash3_log` at 14; reconfiguring the same generator
/// with the full level-22 config and a window log of 27 must leave it at
/// `HC3_HASH_LOG`.
#[test]
fn level22_small_source_uses_window_bounded_hash3_log() {
    use super::strategy::StrategyTag;

    let mut generator = HcMatchGenerator::new(1 << 14);
    // (config, window log passed to `configure`, expected hash3_log)
    let cases = [
        (BTULTRA2_HC_CONFIG_L22_16K, 14, 14),
        (BTULTRA2_HC_CONFIG_L22, 27, HC3_HASH_LOG),
    ];
    for (config, window_log, expected_hash3_log) in cases {
        generator.configure(config, StrategyTag::BtUltra2, window_log);
        assert_eq!(generator.table.hash3_log, expected_hash3_log);
    }
}
8800
/// After matching a first 32 KiB block under the BtUltra2 configuration, the
/// optimal-parse state must hold non-zero literal-length and offset-code
/// statistics.
#[test]
fn btultra2_seed_pass_initializes_opt_state() {
    use super::strategy::StrategyTag;

    let mut generator = HcMatchGenerator::new(1 << 20);
    generator.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // 32 KiB of the byte ramp 0..=250 repeated (i.e. `i % 251`).
    let block: Vec<u8> = (0u8..251).cycle().take(32 * 1024).collect();
    generator.table.add_data(block, |_| {});
    generator.start_matching(|_| {});

    assert!(
        generator.backend.bt_mut().opt_state.lit_length_sum > 0,
        "btultra2 first block should seed non-zero sequence statistics"
    );
    assert!(
        generator.backend.bt_mut().opt_state.off_code_sum > 0,
        "btultra2 first block should seed offset-code statistics"
    );
}
8821
/// The BtUltra2 cost profile must not favor small offsets and must use
/// accurate pricing.
#[test]
fn btultra2_profile_disables_small_offset_handicap() {
    use super::strategy::BtUltra2;

    // Before Phase 3 this test built the profile twice (`pass2=false` and
    // `pass2=true`) because `for_mode` told them apart. With
    // `const_for_strategy::<BtUltra2>()` only one profile exists — the
    // upstream zstd `opt2` pricing — so one binding covers the invariant.
    let opt2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();

    assert!(
        !opt2.favor_small_offsets,
        "btultra2 should match upstream zstd opt2 offset pricing"
    );
    assert!(
        opt2.accurate,
        "btultra2 should use upstream zstd opt2 accurate pricing"
    );
}
8839
/// Pins the BtUltra profile's `max_chain_depth` at 64.
#[test]
fn btultra_profile_keeps_search_depth_budget() {
    use super::strategy::BtUltra;

    let chain_depth = HcOptimalCostProfile::const_for_strategy::<BtUltra>().max_chain_depth;
    assert_eq!(
        chain_depth, 64,
        "btultra chain-depth budget must match clevels.h level 18 searchLog 6 (1 << 6 = 64)"
    );
}
8848
/// Pins the BtOpt profile's `max_chain_depth` at 32.
#[test]
fn btopt_profile_keeps_search_depth_budget() {
    use super::strategy::BtOpt;

    let chain_depth = HcOptimalCostProfile::const_for_strategy::<BtOpt>().max_chain_depth;
    assert_eq!(
        chain_depth, 32,
        "btopt should not cap chain depth below upstream zstd btopt search budget"
    );
}
8857
/// With `target_len` forced to 13 on a BtUltra2-configured generator,
/// `sufficient_match_len_for_pass` must report 13.
#[test]
fn sufficient_match_len_is_clamped_by_target_len() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Override whatever `configure` installed with a small target length.
    matcher.hc.target_len = 13;

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient_len = matcher.hc.sufficient_match_len_for_pass(profile);
    assert_eq!(sufficient_len, 13);
}
8870
/// For each of the BtOpt, BtUltra and BtUltra2 profiles, a `target_len` of 57
/// must be returned unchanged by `sufficient_match_len_for_pass`.
#[test]
fn opt_modes_use_target_len_as_sufficient_len() {
    use super::strategy::{BtOpt, BtUltra, BtUltra2};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = 57;

    let btopt = HcOptimalCostProfile::const_for_strategy::<BtOpt>();
    let btultra = HcOptimalCostProfile::const_for_strategy::<BtUltra>();
    let btultra2 = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();

    for profile in [btopt, btultra, btultra2] {
        let sufficient_len = matcher.hc.sufficient_match_len_for_pass(profile);
        assert_eq!(sufficient_len, 57);
    }
}
8885
/// With a very large `target_len`, `sufficient_match_len_for_pass` must
/// report `HC_OPT_NUM - 1`.
#[test]
fn sufficient_match_len_is_capped_by_opt_num() {
    use super::strategy::BtUltra2;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = usize::MAX / 2;

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient_len = matcher.hc.sufficient_match_len_for_pass(profile);
    assert_eq!(sufficient_len, HC_OPT_NUM - 1);
}
8893
/// Seeds a BtUltra2 generator with a Huffman table plus the default LL/ML/OF
/// FSE tables, rescales the frequencies, and checks that the LL, ML and OF
/// frequency tables differ from the reference values the assertions compare
/// against.
#[test]
// The `&*table` reborrows passed to `seed_dictionary_entropy` are what this lint would flag.
#[allow(clippy::borrow_deref_ref)]
fn dictionary_entropy_seed_initializes_opt_state_from_tables() {
    use super::strategy::{BtUltra2, StrategyTag};
    use crate::fse::fse_encoder;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let literal_sample = b"aaabbbbccccddddeeeeefffffgggg";
    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(literal_sample);
    let ll_table = fse_encoder::default_ll_table();
    let ml_table = fse_encoder::default_ml_table();
    let of_table = fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    matcher
        .backend
        .bt_mut()
        .opt_state
        .rescale_freqs(b"abcd", profile);

    // Reference LL table: 4, 2, then ones for every remaining slot.
    let mut bootstrap_ll_freqs = [1u32; HC_MAX_LL + 1];
    bootstrap_ll_freqs[0] = 4;
    bootstrap_ll_freqs[1] = 2;

    assert_ne!(
        matcher.backend.bt_mut().opt_state.lit_length_freq,
        bootstrap_ll_freqs,
        "dictionary entropy should override fallback LL bootstrap frequencies"
    );

    let ml_is_non_uniform = matcher
        .backend
        .bt_mut()
        .opt_state
        .match_length_freq
        .iter()
        .any(|&freq| freq != 1);
    assert!(
        ml_is_non_uniform,
        "dictionary entropy should seed non-uniform ML frequencies"
    );

    assert_ne!(
        matcher.backend.bt_mut().opt_state.off_code_freq[0],
        6,
        "dictionary entropy should override fallback OF bootstrap frequencies"
    );
}
8942
/// Same checks as the full dictionary-entropy seed test, but with no Huffman
/// table supplied: only the default LL/ML/OF FSE tables are passed in.
#[test]
// The `&*table` reborrows passed to `seed_dictionary_entropy` are what this lint would flag.
#[allow(clippy::borrow_deref_ref)]
fn dictionary_fse_seed_applies_without_huffman_seed() {
    use super::strategy::{BtUltra2, StrategyTag};
    use crate::fse::fse_encoder;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = fse_encoder::default_ll_table();
    let ml_table = fse_encoder::default_ml_table();
    let of_table = fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    matcher
        .backend
        .bt_mut()
        .opt_state
        .rescale_freqs(b"abcd", profile);

    // Reference LL table: 4, 2, then ones for every remaining slot.
    let mut bootstrap_ll_freqs = [1u32; HC_MAX_LL + 1];
    bootstrap_ll_freqs[0] = 4;
    bootstrap_ll_freqs[1] = 2;

    assert_ne!(
        matcher.backend.bt_mut().opt_state.lit_length_freq,
        bootstrap_ll_freqs,
        "FSE seed should still override LL bootstrap frequencies without huffman seed"
    );

    let ml_is_non_uniform = matcher
        .backend
        .bt_mut()
        .opt_state
        .match_length_freq
        .iter()
        .any(|&freq| freq != 1);
    assert!(
        ml_is_non_uniform,
        "FSE seed should still seed non-uniform ML frequencies"
    );

    assert_ne!(
        matcher.backend.bt_mut().opt_state.off_code_freq[0],
        6,
        "FSE seed should still override OF bootstrap frequencies without huffman seed"
    );
}
8986
/// After seeding from the default FSE tables and rescaling on a 3-byte input,
/// the optimal-parser state must report `HcOptPriceType::Dynamic`.
#[test]
// The `&*table` reborrows passed to `seed_dictionary_entropy` are what this lint would flag.
#[allow(clippy::borrow_deref_ref)]
fn dictionary_seed_overrides_predef_price_mode_on_tiny_input() {
    use super::strategy::{BtUltra2, StrategyTag};
    use crate::fse::fse_encoder;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = fse_encoder::default_ll_table();
    let ml_table = fse_encoder::default_ml_table();
    let of_table = fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    matcher
        .backend
        .bt_mut()
        .opt_state
        .rescale_freqs(b"abc", profile);

    let stays_dynamic = matches!(
        matcher.backend.bt_mut().opt_state.price_type,
        HcOptPriceType::Dynamic
    );
    assert!(
        stays_dynamic,
        "dictionary-seeded first block should stay in dynamic mode even for tiny src"
    );
}
9013
/// In both predefined and dynamic pricing modes, the literal-length price at
/// `HC_BLOCKSIZE_MAX` must exceed the price one below it by exactly
/// `HC_BITCOST_MULTIPLIER`.
#[test]
fn lit_length_price_blocksize_max_costs_one_extra_bit() {
    use super::strategy::BtUltra2;

    let below_max = HC_BLOCKSIZE_MAX.saturating_sub(1);

    // Predefined pricing mode on an otherwise fresh state.
    let predef_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let mut predef_stats = HcOptState::new();
    predef_stats.price_type = HcOptPriceType::Predefined;
    let predef_at_max = predef_profile.lit_length_price(&predef_stats, HC_BLOCKSIZE_MAX);
    let predef_below_max = predef_profile.lit_length_price(&predef_stats, below_max);
    assert_eq!(
        predef_at_max,
        predef_below_max + HC_BITCOST_MULTIPLIER,
        "predefined litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
    );

    // Dynamic pricing mode: every frequency slot is 1 and every sum is set to
    // the matching `HC_MAX_* + 1` before the base prices are computed.
    let dyn_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let mut dyn_stats = HcOptState::new();
    dyn_stats.price_type = HcOptPriceType::Dynamic;
    dyn_stats.lit_freq.fill(1);
    dyn_stats.lit_sum = (HC_MAX_LIT + 1) as u32;
    dyn_stats.lit_length_freq.fill(1);
    dyn_stats.lit_length_sum = (HC_MAX_LL + 1) as u32;
    dyn_stats.match_length_freq.fill(1);
    dyn_stats.match_length_sum = (HC_MAX_ML + 1) as u32;
    dyn_stats.off_code_freq.fill(1);
    dyn_stats.off_code_sum = (HC_MAX_OFF + 1) as u32;
    dyn_stats.set_base_prices(true);
    let dyn_at_max = dyn_profile.lit_length_price(&dyn_stats, HC_BLOCKSIZE_MAX);
    let dyn_below_max = dyn_profile.lit_length_price(&dyn_stats, below_max);
    assert_eq!(
        dyn_at_max,
        dyn_below_max + HC_BITCOST_MULTIPLIER,
        "dynamic litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
    );
}
9048
/// Once the default FSE tables have been seeded as dictionary entropy,
/// `should_run_btultra2_seed_pass` must return `false` even for a block just
/// above `HC_PREDEF_THRESHOLD`.
#[test]
// The `&*table` reborrows passed to `seed_dictionary_entropy` are what this lint would flag.
#[allow(clippy::borrow_deref_ref)]
fn btultra2_seed_pass_disabled_when_dictionary_entropy_seed_present() {
    use super::strategy::{BtUltra2, StrategyTag};
    use crate::fse::fse_encoder;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = fse_encoder::default_ll_table();
    let ml_table = fse_encoder::default_ml_table();
    let of_table = fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let block_len = HC_PREDEF_THRESHOLD + 1;
    let seed_pass_enabled = matcher.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !seed_pass_enabled,
        "dictionary-seeded first block should skip btultra2 warmup pass"
    );
}
9067
/// With `history_abs_start` moved to 17 and a 16-byte test chunk pushed,
/// `should_run_btultra2_seed_pass` must return `false`.
#[test]
fn btultra2_seed_pass_disabled_when_prefix_history_exists() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Non-zero absolute start, set before the chunk is pushed.
    matcher.table.history_abs_start = 17;
    let chunk = b"abcdefghijklmnop".to_vec();
    matcher.table.push_test_chunk(chunk);

    let block_len = HC_PREDEF_THRESHOLD + 9;
    let seed_pass_enabled = matcher.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must be first-block only (no prefix history)"
    );
}
9083
/// A block of exactly `HC_PREDEF_THRESHOLD` bytes must not trigger the
/// BtUltra2 seed pass.
#[test]
fn btultra2_seed_pass_disabled_for_tiny_block() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let block_len = HC_PREDEF_THRESHOLD;
    let seed_pass_enabled = matcher.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup should not run at or below predefined threshold"
    );
}
9097
/// Once `lit_length_sum` is non-zero in the optimal-parser state,
/// `should_run_btultra2_seed_pass` must return `false`.
#[test]
fn btultra2_seed_pass_disabled_after_stats_initialized() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Mark the statistics as already populated.
    matcher.backend.bt_mut().opt_state.lit_length_sum = 1;

    let block_len = HC_PREDEF_THRESHOLD + 32;
    let seed_pass_enabled = matcher.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup should run only for first block before stats are initialized"
    );
}
9112
/// When `window_size` exceeds the length of the single live chunk,
/// `should_run_btultra2_seed_pass` must return `false`.
#[test]
fn btultra2_seed_pass_disabled_when_not_at_frame_start() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let block_len = HC_PREDEF_THRESHOLD + 32;

    // Simulate a non-first block: the current block has no prefix in the
    // deque, yet the total produced window already includes prior output.
    // `window_size` is set by hand for that purpose.
    matcher.table.window_size = HC_PREDEF_THRESHOLD + 64;
    // Record the current block as one live chunk (the seed-pass check reads
    // lengths, not bytes).
    matcher.table.chunk_lens.push_back(block_len);

    let seed_pass_enabled = matcher.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must not run after frame start"
    );
}
9132
/// With one raw LDM sequence already queued on the BT backend,
/// `should_run_btultra2_seed_pass` must return `false`.
#[test]
fn btultra2_seed_pass_disabled_when_ldm_sequences_exist() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Window and the single live chunk share the same length.
    let live_len = HC_PREDEF_THRESHOLD + 64;
    matcher.table.window_size = live_len;
    matcher.table.chunk_lens.push_back(live_len);

    let queued_sequence = HcRawSeq {
        match_length: 32,
        offset: 16,
        lit_length: 8,
    };
    matcher.backend.bt_mut().ldm_sequences.push(queued_sequence);

    let block_len = HC_PREDEF_THRESHOLD + 32;
    let seed_pass_enabled = matcher.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must not run when LDM already produced sequences"
    );
}
9153
/// With literal compression switched off, `literal_price` must return
/// `8 * HC_BITCOST_MULTIPLIER` in predefined price mode.
#[test]
fn literal_price_uses_eight_bits_when_literals_uncompressed() {
    use super::strategy::BtUltra2;

    let mut stats = HcOptState::new();
    stats.set_literals_compressed_for_tests(false);
    stats.price_type = HcOptPriceType::Predefined;

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let price = profile.literal_price(&stats, b'a');
    assert_eq!(
        price,
        8 * HC_BITCOST_MULTIPLIER,
        "uncompressed literals should cost 8 bits regardless of price mode"
    );
}
9166
/// With literal compression switched off, one `update_stats` call must leave
/// the literal sum and frequencies at zero while each of the LL, ML and OF
/// sums becomes 1.
#[test]
fn update_stats_skips_literal_frequencies_when_uncompressed() {
    let mut stats = HcOptState::new();
    stats.set_literals_compressed_for_tests(false);
    stats.update_stats(3, b"abc", 4, 8);

    // Literal statistics stay untouched.
    assert_eq!(
        stats.lit_sum, 0,
        "literal sum must remain unchanged when literal compression is disabled"
    );
    let literal_freq_total = stats.lit_freq.iter().sum::<u32>();
    assert_eq!(
        literal_freq_total,
        0,
        "literal frequencies must not be updated when literal compression is disabled"
    );

    // Sequence statistics record the update.
    assert_eq!(
        stats.lit_length_sum, 1,
        "literal-length stats still update for sequence modeling"
    );
    assert_eq!(
        stats.match_length_sum, 1,
        "match-length stats still update for sequence modeling"
    );
    assert_eq!(
        stats.off_code_sum, 1,
        "offset-code stats still update for sequence modeling"
    );
}
9194
/// With literal compression switched off, seeding a Huffman table and then
/// rescaling must leave the literal sum and literal frequencies at zero.
#[test]
// The `&*table` reborrows passed to `seed_dictionary_entropy` are what this lint would flag.
#[allow(clippy::borrow_deref_ref)]
fn dictionary_huffman_seed_ignored_when_literals_uncompressed() {
    use super::strategy::BtUltra2;
    use crate::fse::fse_encoder;

    let mut stats = HcOptState::new();
    stats.set_literals_compressed_for_tests(false);

    let literal_sample = b"aaaaabbbbcccddeeff00112233445566778899";
    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(literal_sample);
    let ll_table = fse_encoder::default_ll_table();
    let ml_table = fse_encoder::default_ml_table();
    let of_table = fse_encoder::default_of_table();
    stats.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    stats.rescale_freqs(b"abcd", profile);

    assert_eq!(
        stats.lit_sum, 0,
        "literal sum must stay zero when literals are uncompressed"
    );
    let literal_freq_total = stats.lit_freq.iter().sum::<u32>();
    assert_eq!(
        literal_freq_total,
        0,
        "literal frequencies must ignore dictionary huffman seed when uncompressed"
    );
}
9221
/// Probes repcode candidates at the second "ABCDEF" with reps `[6, 3, 9]`:
/// offset 6 must be reported when `lit_len` is 1 and must be absent when
/// `lit_len` is 0.
#[test]
fn hc_repcode_candidates_respect_litlen_dependent_rep_order() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"xxxxxxABCDEFABCDEF".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    // Position 12 is the start of the second "ABCDEF"; the first copy sits
    // 6 bytes earlier.
    let probe_pos = 12usize;
    let history_end = matcher.table.history.len();
    let rep_offsets = [6u32, 3u32, 9u32];

    // lit_len = 1
    let mut offsets_with_literals = Vec::new();
    matcher.hc.for_each_repcode_candidate_with_reps(
        &matcher.table,
        probe_pos,
        1,
        rep_offsets,
        history_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_with_literals.push(candidate.offset),
    );
    let rep0_seen_with_literals = offsets_with_literals.contains(&6);
    assert!(
        rep0_seen_with_literals,
        "when lit_len>0, rep0 should be considered and match"
    );

    // lit_len = 0
    let mut offsets_without_literals = Vec::new();
    matcher.hc.for_each_repcode_candidate_with_reps(
        &matcher.table,
        probe_pos,
        0,
        rep_offsets,
        history_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_without_literals.push(candidate.offset),
    );
    let rep0_seen_without_literals = offsets_without_literals.contains(&6);
    assert!(
        !rep0_seen_without_literals,
        "when lit_len==0, rep0 is not directly eligible (ll0 semantics)"
    );
}
9267
/// With both the generator's `search_depth` and the profile's
/// `max_chain_depth` at zero, `collect_optimal_candidates` must still return
/// candidates, including one at offset 3.
#[test]
fn hc_collect_optimal_candidates_keeps_reps_when_chain_depth_zero() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.hc.search_depth = 0;
    matcher.table.history = b"xyzxyzxyzxyz".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    let probe_pos = 6usize;
    let history_end = matcher.table.history.len();

    let zero_depth_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: false,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 0,
    };
    let query = HcCandidateQuery {
        reps: [3, 6, 9],
        lit_len: 1,
        ldm_candidate: None,
    };

    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        zero_depth_profile,
        query,
        &mut candidates,
    );

    assert!(
        !candidates.is_empty(),
        "rep candidates should remain available even when chain depth is zero"
    );
    let has_rep0 = candidates.iter().any(|candidate| candidate.offset == 3);
    assert!(has_rep0, "rep0 candidate should be retained");
}
9305
/// On a run of ten `a` bytes with reps `[1, 4, 8]`, every candidate returned
/// by `collect_optimal_candidates` at position 6 must have offset 1 or 4.
#[test]
fn hc_collect_optimal_candidates_rep_tail_match_skips_chain_probe() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.history = b"aaaaaaaaaa".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    // Index every position before the probe point.
    let probe_pos = 6usize;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);

    let deep_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 32,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        deep_profile,
        query,
        &mut candidates,
    );

    let only_rep_offsets = candidates
        .iter()
        .all(|candidate| matches!(candidate.offset, 1 | 4));
    assert!(
        only_rep_offsets,
        "terminal rep match should return before chain probing adds non-rep offsets"
    );
}
9343
/// On a repeating "abc" history, `collect_optimal_candidates` at position 9
/// must push `skip_insert_until_abs` past the probe position.
#[test]
fn hc_collect_optimal_candidates_long_chain_match_advances_skip_window() {
    let mut matcher = HcMatchGenerator::new(128);
    matcher.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    // Index every position before the probe point, then reset the skip window.
    let probe_pos = 9usize;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);
    matcher.table.skip_insert_until_abs = 0;

    let deep_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 32,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        deep_profile,
        query,
        &mut candidates,
    );

    let skip_until = matcher.table.skip_insert_until_abs;
    assert!(
        skip_until > probe_pos,
        "long chain match should advance skip window to avoid redundant immediate insertions"
    );
}
9381
/// With `sufficient_match_len` set to 10 on a repeating "abc" history, the
/// skip window after `collect_optimal_candidates` must lie past the probe
/// position and at or below the furthest candidate end minus 8.
#[test]
fn hc_collect_optimal_candidates_chain_fast_skip_uses_match_end_minus_8() {
    let mut matcher = HcMatchGenerator::new(128);
    matcher.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 32;

    // Index every position before the probe point, then reset the skip window.
    let probe_pos = 9usize;
    matcher.table.ensure_tables();
    matcher.table.insert_positions(0, probe_pos);
    matcher.table.skip_insert_until_abs = 0;

    let short_sufficient_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: 10,
        max_chain_depth: 32,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        short_sufficient_profile,
        query,
        &mut candidates,
    );

    // Furthest end position reached by any returned candidate.
    let furthest_match_end = candidates
        .iter()
        .map(|candidate| candidate.start.saturating_add(candidate.match_len))
        .max()
        .expect("expected at least one candidate");

    let skip_until = matcher.table.skip_insert_until_abs;
    assert!(
        skip_until > probe_pos,
        "chain fast-skip must advance past current position"
    );
    let upper_bound = furthest_match_end.saturating_sub(8);
    assert!(
        skip_until <= upper_bound,
        "chain fast-skip must not exceed upstream zstd-style matchEndIdx - 8 bound"
    );
}
9428
/// On a 16-byte non-repeating history with zero search depth,
/// `collect_optimal_candidates` at position 8 must leave
/// `skip_insert_until_abs` at exactly the probe position plus one.
#[test]
fn hc_collect_optimal_candidates_advances_skip_window_on_plain_bt_path() {
    let mut matcher = HcMatchGenerator::new(256);
    matcher.table.history = b"abcdefghijklmnop".to_vec();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;
    matcher.table.position_base = 0;
    matcher.hc.search_depth = 0;
    matcher.table.ensure_tables();

    let probe_pos = 8usize;
    matcher.table.skip_insert_until_abs = 0;

    let zero_depth_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 0,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };

    let history_end = matcher.table.history.len();
    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        history_end,
        zero_depth_profile,
        query,
        &mut candidates,
    );

    let expected_skip_until = probe_pos.saturating_add(1);
    assert_eq!(
        matcher.table.skip_insert_until_abs,
        expected_skip_until,
        "plain BT path should advance skip window by 1 via upstream zstd matchEndIdx baseline"
    );
}
9467
9468// Removed: the three `hc_collect_optimal_candidates_*_hash3_*` /
9469// `hc_hash3_tail_match_*` tests forced `search_depth = 0` together
9470// with `hash3_log != 0`, an HC-chain-walker-only fixture state that
9471// production never reaches (hash3 is BtUltra2-only and BtUltra2 always
9472// runs `search_depth = 512`). They depended on the `has_hash3 =>
9473// BtUltra2` escape hatch in the test dispatcher; with that hatch gone
9474// (CR review on PR #123) and the dispatcher routing purely from
9475// `self.strategy_tag`, there is no production-shaped configuration
9476// that reproduces what those tests asserted. The corresponding hash3
9477// invariants are exercised end-to-end by the existing level22 roundtrip
9478// + upstream zstd-parity ratio gate.
9479
/// An LDM candidate handed to `collect_optimal_candidates` through the query
/// must show up in the output with the same offset and match length.
#[test]
fn hc_ldm_candidates_are_merged_into_optimal_candidates() {
    let mut matcher = HcMatchGenerator::new(512);
    matcher.table.history = (0..256).map(|i| (i % 251) as u8).collect();
    matcher.table.history_start = 0;
    matcher.table.history_abs_start = 0;

    let probe_pos = 128usize;
    let block_end = 256usize;

    // Externally supplied long-distance match starting at the probe position.
    let ldm_match = MatchCandidate {
        match_len: 40,
        offset: 96,
        start: probe_pos,
    };

    let zero_depth_profile = HcOptimalCostProfile {
        favor_small_offsets: false,
        accurate: true,
        sufficient_match_len: usize::MAX / 2,
        max_chain_depth: 0,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: Some(ldm_match),
    };

    let mut candidates = Vec::new();
    matcher.collect_optimal_candidates(
        probe_pos,
        block_end,
        zero_depth_profile,
        query,
        &mut candidates,
    );

    let ldm_present = candidates.iter().any(|candidate| {
        candidate.offset == ldm_match.offset && candidate.match_len == ldm_match.match_len
    });
    assert!(
        ldm_present,
        "LDM candidate should be present in optimal candidate set"
    );
}
9520
/// Checks that both BtUltra2 and BtUltra return at least one candidate with
/// an offset of 32 or more when probing position 96 of a 160-byte history
/// whose `dictionary_limit_abs` is 64.
///
/// The 24 bytes at the probe position are copied from `history[16..40]`, so
/// the matching source lies below the dictionary limit. The candidate buffer
/// is cleared between the two strategy runs so that each assertion only sees
/// the candidates produced by its own strategy.
#[test]
fn btultra_and_btultra2_both_keep_dictionary_candidates() {
    // Routes the BtUltra2 / BtUltra fixture through the production
    // `configure()` path so derived state (`hash3_log`, `is_btultra2`,
    // `uses_bt`, `backend`) stays consistent — manually flipping the
    // strategy flags here used to leave `hash3_log` / `hash3_table` in
    // the previous mode's shape and trip the
    // `Strategy::USE_HASH3 ⇒ hash3_log != 0` debug invariant inside
    // `collect_optimal_candidates_initialized_body`.
    use super::strategy::StrategyTag;

    let test_config = HcConfig {
        hash_log: 23,
        chain_log: 22,
        search_depth: 32,
        target_len: 256,
        search_mls: 4,
    };
    let window_log = 20u8;

    // Builds the shared 160-byte fixture and indexes everything before `abs_pos`.
    let prepare_history = |hc: &mut HcMatchGenerator, abs_pos: usize| {
        hc.table.history = alloc::vec![0u8; 160];
        // First 64 bytes: period-7 pattern starting at `a`.
        for i in 0..64 {
            hc.table.history[i] = b'a' + (i % 7) as u8;
        }
        // Remaining bytes: period-5 pattern starting at `k`.
        for i in 64..160 {
            hc.table.history[i] = b'k' + (i % 5) as u8;
        }
        // Plant a 24-byte copy of `history[16..40]` at the probe position.
        for i in 0..24 {
            hc.table.history[abs_pos + i] = hc.table.history[16 + i];
        }
        hc.table.history_start = 0;
        hc.table.history_abs_start = 0;
        hc.table.position_base = 0;
        hc.table.ensure_tables();
        hc.table.insert_positions(0, abs_pos);
        hc.table.dictionary_limit_abs = Some(64);
        hc.table.skip_insert_until_abs = 0;
    };

    let profile = HcOptimalCostProfile {
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
        accurate: true,
        favor_small_offsets: false,
    };
    let abs_pos = 96usize;
    let mut out = Vec::new();

    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra2, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra2 should retain dictionary candidates on upstream zstd-parity path"
    );

    // Drop the BtUltra2 results: without this, stale entries left in the
    // shared buffer could satisfy the BtUltra assertion below on their own.
    out.clear();

    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra should retain dictionary candidates"
    );
}
9608
/// Compares the dfast table lengths of an unhinted level-3 reset with those
/// after a 1 KiB source-size hint: the hinted tables must both be
/// `1 << MIN_WINDOW_LOG` long and strictly smaller than the unhinted ones.
#[test]
fn driver_small_source_hint_shrinks_dfast_hash_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Baseline: no source-size hint.
    driver.reset(CompressionLevel::Level(3));
    let baseline_payload = b"abcabcabcabc";
    let mut baseline_space = driver.get_next_space();
    baseline_space[..baseline_payload.len()].copy_from_slice(baseline_payload);
    baseline_space.truncate(baseline_payload.len());
    driver.commit_space(baseline_space);
    driver.skip_matching_with_hint(None);

    // Upstream zstd-parity split sizes: long-hash = DFAST_HASH_BITS,
    // short-hash = DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA.
    let unhinted_long = driver.dfast_matcher().long_len();
    let unhinted_short = driver.dfast_matcher().short_len();
    assert_eq!(unhinted_long, 1 << DFAST_HASH_BITS);
    assert_eq!(
        unhinted_short,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA)
    );

    // Same level again, this time with a 1 KiB source-size hint.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(3));
    let hinted_payload = b"xyzxyzxyzxyz";
    let mut hinted_space = driver.get_next_space();
    hinted_space[..hinted_payload.len()].copy_from_slice(hinted_payload);
    hinted_space.truncate(hinted_payload.len());
    driver.commit_space(hinted_space);
    driver.skip_matching_with_hint(None);
    let hinted_long = driver.dfast_matcher().long_len();
    let hinted_short = driver.dfast_matcher().short_len();

    // The wire `window_log` stays at its floor (decoder-interop), but the
    // internal dfast tables are sized from the RAW 1 KiB source, not the
    // floored window: `table_window = 1 << ceil_log2(1024) = 1 << 10`, so
    // both tables land at the `MIN_WINDOW_LOG` floor (the long table at
    // `dfast_hash_bits_for_window(1 << 10) = 10`, the short table one
    // `DFAST_SHORT_HASH_BITS_DELTA` step below but clamped back up to
    // `MIN_WINDOW_LOG`).
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(hinted_long, 1 << MIN_WINDOW_LOG);
    assert_eq!(hinted_short, 1 << MIN_WINDOW_LOG);

    let both_tables_shrunk = hinted_long < unhinted_long && hinted_short < unhinted_short;
    assert!(
        both_tables_shrunk,
        "tiny source hint should reduce both dfast tables"
    );
}
9654
/// With a `u64::MAX` source-size hint, a level-3 reset followed by one
/// committed block must leave the dfast long table at least
/// `1 << MIN_WINDOW_LOG` entries long.
#[test]
fn driver_huge_source_hint_does_not_overflow_table_window_shift() {
    // Regression: the Dfast / Row table-window sizing in `reset` derives a
    // shift from `ceil_log2(hint)`. A hint >= 2^63 + 1 makes that shift 64,
    // and `1usize << 64` panics in debug / wraps to 0 in release before the
    // `.min(max_window_size)` cap can apply. A `u64::MAX` pledged source size
    // must size the table to the real window, never panic or wrap to zero.
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.reset(CompressionLevel::Level(3));

    let payload = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..payload.len()].copy_from_slice(payload);
    space.truncate(payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    let long_table_len = driver.dfast_matcher().long_len();
    let minimum_len = 1 << MIN_WINDOW_LOG;
    assert!(
        long_table_len >= minimum_len,
        "huge hint must size the dfast table from the real window, not wrap to zero"
    );
}
9677
/// With a `u64::MAX` source-size hint plus a 64 KiB dictionary hint, a
/// level-16 reset must reserve at least `window + window/4 + MAX_BLOCK_SIZE`
/// bytes of HC history capacity, and a block can still be committed afterwards.
#[test]
fn driver_huge_source_hint_with_dict_does_not_overflow_hc_reserve() {
    // Regression: the HC/BT history-mirror pre-size adds the dictionary
    // hint to the source-size hint before `reserve_history` clamps to the
    // window ceiling. A `u64::MAX` pledged source size (the "unknown size"
    // sentinel) plus any positive dictionary hint overflows `usize` in
    // `(src as usize) + dict_hint` — debug panic / release wrap on 64-bit,
    // and `src as usize` truncation on 32-bit targets. Level 16 (BtOpt)
    // routes through the HashChain/BT storage arm that owns this reserve.
    // Must size the mirror to the real window, never panic, wrap, or
    // truncate.
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.set_dictionary_size_hint(64 * 1024);
    driver.reset(CompressionLevel::Level(16));

    // The saturated `usize::MAX` reserve target must be clamped to the HC
    // history ceiling, not reserved literally (which would OOM/panic).
    // Level 16 has window_log 22, so the ceiling is
    // `window + window/4 + one block` (the `reserve_history` formula).
    // Assert the reserve actually reached it — a no-panic-only check would
    // also pass on an under-reserved mirror.
    let window = 1usize << 22;
    let one_block = crate::common::MAX_BLOCK_SIZE as usize;
    let expected_history_ceiling = window + (window >> 2) + one_block;
    let reserved = driver.hc_matcher().table.history.capacity();
    assert!(
        reserved >= expected_history_ceiling,
        "huge source + dict hint must reserve the clamped HC history ceiling, got {}",
        reserved
    );

    // Committing a small block afterwards must not panic.
    let payload = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..payload.len()].copy_from_slice(payload);
    space.truncate(payload.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
}
9713
#[test]
fn driver_chain_log_override_survives_row_to_hc_fallback() {
    // Regression guard: a RowHash level forced onto the HashChain backend
    // (resolved window <= 14, upstream `ZSTD_resolveRowMatchFinderMode`) has
    // to honour an explicit `chain_log` override in the synthesised HC chain
    // table. The RowHash override arm drops `chain_log` (Row has no chain
    // table), so the synthesis used to substitute the upstream zstd
    // `hashLog - 1` and silently ignore the caller on small-window frames.
    const PAYLOAD: &[u8] = b"abcabcabcabc";
    let requested_chain_log = 10u32;

    let mut driver = MatchGeneratorDriver::new(32, 2);
    // The small source hint pins the window to the hinted floor (16 KiB =
    // windowLog 14), which sends the Level 6 Row finder to HashChain.
    driver.set_source_size_hint(1 << 12);
    driver.set_param_overrides(Some(super::parameters::ParamOverrides {
        chain_log: Some(requested_chain_log),
        ..Default::default()
    }));
    driver.reset(CompressionLevel::Level(6));

    let mut block = driver.get_next_space();
    block[..PAYLOAD.len()].copy_from_slice(PAYLOAD);
    block.truncate(PAYLOAD.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // The override (10) sits below the window cap (14), so the resolved HC
    // chain table must carry it rather than the upstream zstd `hashLog - 1`
    // (18, clamped to the window 14). Before the fix this resolved to 14.
    let resolved_chain_log = driver.hc_matcher().table.chain_log;
    assert_eq!(
        resolved_chain_log,
        requested_chain_log as usize,
        "explicit chain_log override must survive the Row->HC fallback, got {}",
        resolved_chain_log
    );
}
9748
#[test]
fn driver_small_source_hint_shrinks_row_hash_tables() {
    const FIRST_PAYLOAD: &[u8] = b"abcabcabcabc";
    const SECOND_PAYLOAD: &[u8] = b"xyzxyzxyzxyz";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Phase 1: no hint. Level 5 runs with the upstream row_log
    // (clamp(searchLog=3, 4, 6) = 4) and the upstream L5 hashLog
    // (`ZSTD_getCParams(5,..).hashLog` = 19), so the row count is
    // 1 << (ROW_L5.hash_bits - ROW_L5.row_log).
    driver.reset(CompressionLevel::Level(5));
    let mut block = driver.get_next_space();
    block[..FIRST_PAYLOAD.len()].copy_from_slice(FIRST_PAYLOAD);
    block.truncate(FIRST_PAYLOAD.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);
    let full_rows = driver.row_matcher().row_heads.len();
    assert_eq!(full_rows, 1 << (ROW_L5.hash_bits - ROW_L5.row_log));

    // Phase 2: a hint that leaves the resolved window above 14 keeps the Row
    // finder (upstream `ZSTD_resolveRowMatchFinderMode`: row mode on for
    // windowLog > 14) and narrows the row hash table to the source-derived
    // width. 64 KiB gives a raw source log of 16, so
    // `row_hash_bits_for_window(1 << 16)` is below the level's full
    // hash_bits (19) and the row count drops.
    driver.set_source_size_hint(1 << 16);
    driver.reset(CompressionLevel::Level(5));
    let mut block = driver.get_next_space();
    block[..SECOND_PAYLOAD.len()].copy_from_slice(SECOND_PAYLOAD);
    block.truncate(SECOND_PAYLOAD.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);
    assert_eq!(
        driver.active_backend(),
        super::strategy::BackendTag::Row,
        "windowLog > 14 keeps the upstream row matchfinder"
    );
    let hinted_rows = driver.row_matcher().row_heads.len();
    assert!(
        hinted_rows < full_rows,
        "a window>14 source hint should reduce the row hash table footprint"
    );

    // Phase 3: a tiny hint floors the resolved window at
    // MIN_HINTED_WINDOW_LOG = 14. Upstream uses the HASH-CHAIN matcher (not
    // Row) at windowLog <= 14, so greedy/lazy/lazy2 must be routed to the
    // HashChain backend there.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(
        driver.active_backend(),
        super::strategy::BackendTag::HashChain,
        "windowLog <= 14 must fall back to the upstream zstd hash-chain matchfinder",
    );
}
9800
#[test]
fn row_matches_roundtrip_multi_block_pattern() {
    /// Applies one emitted sequence to `decoded`: literals are appended
    /// first, then a triple's match is copied byte by byte from `offset`
    /// bytes back so that overlapping copies behave like a decoder.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        decoded.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_reference {
            let start = decoded.len() - offset;
            for src in start..start + match_len {
                let byte = decoded[src];
                decoded.push(byte);
            }
        }
    }

    const BLOCK_LEN: usize = 128 * 1024;
    let pattern: [u8; 10] = [7, 13, 44, 184, 19, 96, 171, 109, 141, 251];
    let first_block: Vec<u8> = pattern.iter().copied().cycle().take(BLOCK_LEN).collect();
    // The second block repeats the same cycled pattern from its start.
    let second_block: Vec<u8> = first_block.clone();

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.ensure_tables();

    // Block 1 must replay to exactly the input.
    let mut history = Vec::new();
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(history, first_block);

    // Block 2 is replayed on top of block 1's output.
    matcher.add_data(second_block.clone(), |_| {});
    let second_start = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(&history[second_start..], second_block.as_slice());

    // Force a literals-only pass so the Sequence::Literals arm is exercised.
    let third_block: Vec<u8> = (0u8..=255).collect();
    matcher.add_data(third_block.clone(), |_| {});
    let third_start = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(&history[third_start..], third_block.as_slice());
}
9844
#[test]
fn row_short_block_emits_literals_only() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // A 5-byte block must come back as literals only.
    matcher.add_data(b"abcde".to_vec(), |_| {});

    let mut saw_triple = false;
    let mut reconstructed = Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            reconstructed.extend_from_slice(literals);
        } else {
            saw_triple = true;
        }
    });

    assert!(
        !saw_triple,
        "row backend must not emit triples for short blocks"
    );
    assert_eq!(reconstructed, b"abcde");

    // Then feed a clearly matchable block and ensure the Triple arm is reachable.
    saw_triple = false;
    matcher.add_data(b"abcdeabcde".to_vec(), |_| {});
    matcher.start_matching(|seq| {
        saw_triple |= matches!(seq, Sequence::Triple { .. });
    });
    assert!(
        saw_triple,
        "row backend should emit triples on repeated data"
    );
}
9878
#[test]
fn row_pick_lazy_returns_best_when_lookahead_is_out_of_bounds() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.add_data(b"abcabc".to_vec(), |_| {});
    // The row tables have to exist before probing: with a small enough
    // accept floor the lookahead path passes the length gate and reaches
    // `row_candidate` -> `row_heads[..]` (production always builds the
    // tables before any candidate probe).
    matcher.ensure_tables();

    let incoming = MatchCandidate {
        start: 0,
        offset: 1,
        match_len: ROW_MIN_MATCH_LEN,
    };
    let chosen = matcher
        .pick_lazy_match(0, 0, Some(incoming))
        .expect("best candidate must survive");

    // The candidate handed in must come back field for field.
    assert_eq!(chosen.start, incoming.start);
    assert_eq!(chosen.offset, incoming.offset);
    assert_eq!(chosen.match_len, incoming.match_len);
}
9903
#[test]
fn row_backfills_previous_block_tail_for_cross_boundary_match() {
    /// Applies one emitted sequence to `decoded`: literals are appended
    /// first, then a triple's match is copied byte by byte from `offset`
    /// bytes back.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        decoded.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_reference {
            let start = decoded.len() - offset;
            for src in start..start + match_len {
                let byte = decoded[src];
                decoded.push(byte);
            }
        }
    }

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    // Block 1 ends in "XYZ"; block 2 opens with the same three bytes.
    let mut first_block = alloc::vec![0xA5; 64];
    first_block.extend_from_slice(b"XYZ");
    let second_block = b"XYZXYZtail".to_vec();

    let mut reconstructed = Vec::new();
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut reconstructed, seq));
    assert_eq!(reconstructed, first_block);

    matcher.add_data(second_block.clone(), |_| {});
    let prefix_len = reconstructed.len();
    let mut saw_cross_boundary = false;
    matcher.start_matching(|seq| {
        // Looking for a triple with no literals that reaches 3 bytes back.
        let reuses_tail = matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 3 && match_len >= ROW_MIN_MATCH_LEN
        );
        if reuses_tail {
            saw_cross_boundary = true;
        }
        replay(&mut reconstructed, seq);
    });

    assert!(
        saw_cross_boundary,
        "row matcher should reuse the 3-byte previous-block tail"
    );
    assert_eq!(&reconstructed[prefix_len..], second_block.as_slice());
}
9958
#[test]
fn row_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    // Builds a fresh row matcher, skips `bytes` under `hint`, and reports
    // how many row slots ended up occupied.
    let occupied_after_skip = |hint: Option<bool>, bytes: Vec<u8>| -> usize {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(ROW_CONFIG);
        matcher.add_data(bytes, |_| {});
        matcher.skip_matching_with_hint(hint);
        matcher
            .row_positions
            .iter()
            .filter(|slot| **slot != ROW_EMPTY_SLOT)
            .count()
    };

    let data = deterministic_high_entropy_bytes(0xA713_9C5D_44E2_10B1, 4096);

    let dense_slots = occupied_after_skip(Some(false), data.clone());
    let sparse_slots = occupied_after_skip(Some(true), data);

    assert!(
        sparse_slots < dense_slots,
        "incompressible hint should seed fewer row slots (sparse={sparse_slots}, dense={dense_slots})"
    );
}
9988
/// Regression for the `None` arm of `skip_matching_with_hint`: the skipped
/// range must NOT be densely inserted into the row table.
///
/// This follows upstream zstd (`ZSTD_row_fillHashCache` only pre-fills the
/// next-scan cache, not the interior of the skipped block): cross-block
/// matches into the skipped interior are given up in exchange for not paying
/// the per-block O(block_size) insert cost.
///
/// With less than one block of input (4096 B against the default 128 KiB
/// block boundary), the only positions the row table may hold afterwards are
/// the ones produced by the `backfill_start` lookback at the block's start
/// (≤ `ROW_HASH_KEY_LEN - 1` positions when block_start <
/// ROW_HASH_KEY_LEN). With `current_abs_start == 0` even that backfill is
/// empty, so the table stays completely empty.
#[test]
fn row_skip_matching_with_none_hint_leaves_interior_empty() {
    // Builds a fresh row matcher, skips `bytes` under `hint`, and reports
    // how many row slots ended up occupied.
    let occupied_after_skip = |hint: Option<bool>, bytes: Vec<u8>| -> usize {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        matcher.configure(ROW_CONFIG);
        matcher.add_data(bytes, |_| {});
        matcher.skip_matching_with_hint(hint);
        matcher
            .row_positions
            .iter()
            .filter(|slot| **slot != ROW_EMPTY_SLOT)
            .count()
    };

    let data = deterministic_high_entropy_bytes(0x9B47_F2A1_8C5E_3306, 4096);

    let none_slots = occupied_after_skip(None, data.clone());
    // Control case: Some(false) is the dict-priming path, which inserts
    // every position of the skipped range.
    let dense_slots = occupied_after_skip(Some(false), data);

    // The contract is pinned by two checks:
    // 1) with block-start == 0 the None hint inserts ZERO positions (there
    //    is nothing before position 0 to backfill);
    // 2) the dense path still inserts something, so check 1 is meaningful.
    assert_eq!(
        none_slots, 0,
        "None hint at block_start=0 must leave row table fully empty \
         (upstream zstd parity — interior NOT inserted, no pre-block backfill possible)",
    );
    assert!(
        dense_slots > 0,
        "Some(false) dict-priming path must still insert densely \
         (sanity check: control case for the `none_slots == 0` assertion)",
    );
}
10043
#[test]
fn driver_unhinted_level2_keeps_default_dfast_hash_table_size() {
    // Without a source-size hint the Dfast tables must keep their default
    // widths.
    //
    // NOTE(review): the test name still says "level2", but the body resets
    // to `Level(3)`. The assertion messages name the level that is actually
    // exercised so a failure report is not misleading.
    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Level(3));
    let mut space = driver.get_next_space();
    space[..12].copy_from_slice(b"abcabcabcabc");
    space.truncate(12);
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // Upstream zstd-parity split: long-hash at DFAST_HASH_BITS, short-hash one
    // bit smaller (DFAST_SHORT_HASH_BITS_DELTA = 1, matching upstream zstd
    // `chainLog = hashLog - 1` for dfast levels).
    let long_len = driver.dfast_matcher().long_len();
    let short_len = driver.dfast_matcher().short_len();
    assert_eq!(
        long_len,
        1 << DFAST_HASH_BITS,
        "unhinted Level(3) should keep default long-hash table size"
    );
    assert_eq!(
        short_len,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA),
        "unhinted Level(3) short-hash should be one bit smaller than long-hash"
    );
}
10071
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_backend_rejects_undersized_pooled_suffix_store() {
    const BLOCK_LEN: usize = 4096;

    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Fastest);

    // Seed the pool with a store that is too small for the block below.
    driver.suffix_pool.push(SuffixStore::with_capacity(1024));

    let mut block = driver.get_next_space();
    block.clear();
    block.resize(BLOCK_LEN, 0xAB);
    driver.commit_space(block);

    let newest_entry = driver
        .simple()
        .window
        .last()
        .expect("window entry must exist after commit");
    let last_suffix_slots = newest_entry.suffixes.slots.len();
    assert!(
        last_suffix_slots >= BLOCK_LEN,
        "undersized pooled suffix store must not be reused for larger blocks"
    );
}
10098
#[test]
fn source_hint_clamps_driver_slice_size_to_window() {
    // The driver starts with a 128 KiB slice size; a 1 KiB source hint must
    // bring both the window and the slice size down to the hinted floor.
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    let hinted_window = driver.window_size() as usize;
    assert_eq!(hinted_window, 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(driver.slice_size, hinted_window);

    // Buffers handed out afterwards have the clamped length too.
    let block = driver.get_next_space();
    assert_eq!(block.len(), hinted_window);
    driver.commit_space(block);
}
10113
#[test]
fn pooled_space_keeps_capacity_when_slice_size_shrinks() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Default);

    // Take a full-size buffer, note its capacity, and hand it back.
    let full_size_buffer = driver.get_next_space();
    let large_capacity = full_size_buffer.capacity();
    assert!(large_capacity >= 128 * 1024);
    driver.commit_space(full_size_buffer);

    // Shrink the slice size through a small source hint.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    // The next buffer is shorter but must not have lost its allocation.
    let shrunk_buffer = driver.get_next_space();
    assert_eq!(shrunk_buffer.len(), 1 << MIN_HINTED_WINDOW_LOG);
    let retained_capacity = shrunk_buffer.capacity();
    assert!(
        retained_capacity >= large_capacity,
        "pooled buffer capacity should be preserved to avoid shrink/grow churn"
    );
}
10134
#[test]
fn driver_best_to_fastest_releases_oversized_hc_tables() {
    const PAYLOAD: &[u8] = b"abcabcabcabc";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Start at Best, routed onto HashChain through the test-only override
    // (production `Best` sits on level 13, whose native backend differs).
    // That allocates the large HC tables (4M hash, 2M chain), so the swap
    // further down goes through the HC drain path this test pins.
    driver.reset_on_hc_lazy(CompressionLevel::Best);
    assert_eq!(driver.window_size(), (1u64 << 22));

    // Feed data so tables are actually allocated via ensure_tables().
    let mut block = driver.get_next_space();
    block[..PAYLOAD.len()].copy_from_slice(PAYLOAD);
    block.truncate(PAYLOAD.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Switching to Fastest swaps the [`MatcherStorage`] enum to its `Simple`
    // variant and drops the `HashChain` variant. The drain block in
    // `Matcher::reset` reassigns `m.table.hash_table` / `chain_table` /
    // `hash3_table` to `Vec::new()` BEFORE the replacement variant is
    // constructed, releasing the table allocations up front. Peak memory
    // during the swap is thereby capped at "old data buffers being drained
    // into `vec_pool` + new `MatchGenerator` skeleton" instead of "old
    // tables still resident + new variant under construction". `Drop` on
    // the old variant would free the tables anyway, but only once the new
    // variant exists, so the early reassign moves the peak. After the
    // switch there is no HC variant left to inspect; asserting that storage
    // is now `Simple` covers the invariant the old hash_table/chain_table
    // checks were standing in for.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), (1u64 << 19));
    assert_eq!(driver.active_backend(), super::strategy::BackendTag::Simple);
}
10171
#[test]
fn driver_better_to_best_resizes_hc_tables() {
    const FIRST_PAYLOAD: &[u8] = b"abcabcabcabc";
    const SECOND_PAYLOAD: &[u8] = b"xyzxyzxyzxyz";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // The lazy band now runs on the Row backend, so the HC resize path is
    // driven through two BT levels with different native `HcConfig` widths:
    // L13 (hash_log 22, chain_log 22) -> L15 (hash_log 23, chain_log 23).
    driver.reset(CompressionLevel::Level(13));
    assert_eq!(driver.window_size(), (1u64 << 22));

    let mut block = driver.get_next_space();
    block[..FIRST_PAYLOAD.len()].copy_from_slice(FIRST_PAYLOAD);
    block.truncate(FIRST_PAYLOAD.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Record the L13 table sizes as the baseline.
    let (better_hash_len, better_chain_len) = {
        let table = &driver.hc_matcher().table;
        (table.hash_table.len(), table.chain_table.len())
    };

    // Switch to L15 — must resize to larger tables.
    driver.reset(CompressionLevel::Level(15));
    assert_eq!(driver.window_size(), (1u64 << 22));

    // Feed data to trigger ensure_tables with new sizes.
    let mut block = driver.get_next_space();
    block[..SECOND_PAYLOAD.len()].copy_from_slice(SECOND_PAYLOAD);
    block.truncate(SECOND_PAYLOAD.len());
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    let (best_hash_len, best_chain_len) = {
        let table = &driver.hc_matcher().table;
        (table.hash_table.len(), table.chain_table.len())
    };
    assert!(
        best_hash_len > better_hash_len,
        "L15 hash_table ({}) should be larger than L13 ({})",
        best_hash_len,
        better_hash_len
    );
    assert!(
        best_chain_len > better_chain_len,
        "L15 chain_table ({}) should be larger than L13 ({})",
        best_chain_len,
        better_chain_len
    );
}
10217
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_dictionary_preserves_history_for_first_full_block() {
    const CONTENT: &[u8] = b"abcdefgh";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    driver.prime_with_dictionary(CONTENT, [1, 4, 8]);

    // The first block repeats the dictionary content byte for byte.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(CONTENT);
    driver.commit_space(block);

    // Expect a literal-free triple that reaches 8 bytes back.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "first full block should still match dictionary-primed history"
    );
}
10252
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_large_dictionary_preserves_early_history_until_first_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // 24-byte dictionary; the block below repeats its first 8 bytes.
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"abcdefgh");
    driver.commit_space(block);

    // Expect a literal-free triple that reaches 24 bytes back.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 24 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "dictionary bytes should remain addressable until frame output exceeds the live window"
    );
}
10287
#[test]
fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
    // Priming with no dictionary content must still install the repeat
    // offsets that were passed in.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    let no_content: &[u8] = &[];
    driver.prime_with_dictionary(no_content, [11, 7, 3]);

    let primed_history = driver.simple_mut().offset_hist;
    assert_eq!(primed_history, [11, 7, 3]);
}
10297
#[test]
fn hc_prime_with_empty_dictionary_disables_btultra2_seed_pass() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    let no_content: &[u8] = &[];
    driver.prime_with_dictionary(no_content, [11, 7, 3]);

    // The repeat offsets are installed even though there is no content.
    let primed_history = driver.hc_matcher().table.offset_hist;
    assert_eq!(primed_history, [11, 7, 3]);

    // The seed pass must then report itself as disabled.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming, even when dict content is empty"
    );
}
10313
#[test]
fn primed_snapshot_not_restored_across_ldm_config_change() {
    // The CDict-equivalent primed snapshot is a clone of `storage`, and on
    // the BT backend that includes `BtMatcher::ldm_producer`. Restoring a
    // snapshot taken under one LDM configuration into a reset that resolved
    // a different one would leave a stale producer behind, so `PrimedKey`
    // has to fold the LDM override into the key: the restore is refused and
    // the caller re-primes.
    use super::parameters::CompressionParameters;

    let level = CompressionLevel::Level(19);
    let dict = b"abcdefghabcdefghabcdefgh";
    let offsets = [1, 4, 8];

    let ldm_on = CompressionParameters::builder(level)
        .enable_long_distance_matching(true)
        .build()
        .unwrap()
        .overrides();
    let ldm_off = CompressionParameters::builder(level)
        .build()
        .unwrap()
        .overrides();

    let mut driver = MatchGeneratorDriver::new(1024, 1);

    // Step 1: prime and capture with LDM enabled.
    driver.set_param_overrides(Some(ldm_on));
    driver.reset(level);
    driver.prime_with_dictionary(dict, offsets);
    driver.capture_primed_dictionary(level);

    // Step 2: same dictionary and level with LDM disabled. The captured LDM
    // state no longer applies, so the restore has to be refused.
    driver.set_param_overrides(Some(ldm_off));
    driver.reset(level);
    let restored_across_change = driver.restore_primed_dictionary(level);
    assert!(
        !restored_across_change,
        "primed snapshot restored across an LDM config change (stale producer)",
    );

    // Step 3 (sanity): re-prime and capture under LDM-off, then restore
    // under the IDENTICAL LDM-off config. This must match, which shows the
    // key is not over-tight.
    driver.prime_with_dictionary(dict, offsets);
    driver.capture_primed_dictionary(level);
    driver.reset(level);
    let restored_same_config = driver.restore_primed_dictionary(level);
    assert!(
        restored_same_config,
        "primed snapshot not restored under identical LDM config",
    );
}
10362
#[test]
fn hc_prime_with_dictionary_disables_btultra2_seed_pass() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    // Prime with real dictionary content this time.
    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming with content"
    );
}
10377
#[test]
fn dfast_prime_with_dictionary_preserves_history_for_first_full_block() {
    // Level(4) is Dfast running the greedy double-fast loop (upstream zstd
    // parity: clevels.h L3/L4 are both `ZSTD_dfast`, which has no lazy
    // lookahead). The fast loop wants at least `HASH_READ_SIZE` (8) bytes
    // ahead of the probe cursor, hence a 16-byte dict followed by a 16-byte
    // block: the whole block matches the dict at offset = dict length = 16.
    let payload = b"abcdefghijklmnop";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(4));
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    // Expect a literal-free triple that reaches one dictionary length back.
    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty()
                && offset == payload.len()
                && match_len >= DFAST_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "dfast backend should match dictionary-primed history in first full block"
    );
}
10416
#[test]
fn prime_with_dictionary_does_not_inflate_reported_window_size() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // Sample the reported window on both sides of the priming call.
    let window_before_priming = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let window_after_priming = driver.window_size();

    assert_eq!(
        window_after_priming, window_before_priming,
        "dictionary retention budget must not change reported frame window size"
    );
}
10431
#[test]
fn primed_snapshot_not_restored_when_window_hint_differs() {
    // The copy-snapshot key has to cover the resolved reset parameters, not
    // only the CompressionLevel. `reset()` caps window_log by the
    // source-size hint, so two frames at the same level with different
    // hints resolve to different windows. If a snapshot captured at the
    // larger hint were restored into a reset for the smaller one, the frame
    // header would advertise the smaller window while the matcher's
    // `max_window_size` (from the restored storage) still covered the
    // larger one. The encoder could then emit a match (e.g. into the
    // dictionary) beyond the advertised window and produce an undecodable
    // frame. Restore must REFUSE when the resolved window differs.
    const LARGE_HINT: u64 = 256 * 1024;
    const SMALL_HINT: u64 = 48 * 1024;

    let level = CompressionLevel::Best;
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Frame A: the large hint resolves to the larger window; prime and
    // capture there.
    driver.set_source_size_hint(LARGE_HINT);
    driver.reset(level);
    let big_window = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Frame B: SAME level, smaller hint, hence a smaller resolved window.
    driver.set_source_size_hint(SMALL_HINT);
    driver.reset(level);
    let small_window = driver.window_size();
    assert!(
        small_window < big_window,
        "precondition: the two hints must resolve to different windows \
         (small={small_window}, big={big_window})"
    );

    let was_restored = driver.restore_primed_dictionary(level);
    assert!(
        !was_restored,
        "snapshot captured at window {big_window} must NOT be restored into a \
         reset advertising window {small_window} (level alone is an insufficient key)"
    );
}
10470
#[test]
fn primed_snapshot_restored_for_hints_in_same_window_bucket() {
    // Counterpart of the "window differs" case: the snapshot key has to be the
    // resolved matcher geometry, not the raw hinted byte count. `reset()`
    // derives every hint-dependent parameter (window_log cap, HC/Fast/Dfast/Row
    // table widths, the Fast attach-vs-copy cutoff) from `ceil_log2(hint)`, so
    // two different hints in the same ceil-log bucket yield the *identical*
    // matcher shape. A key built from the raw bytes would over-key and force a
    // full re-prime on the second frame although the cached snapshot fits
    // perfectly. Restore must SUCCEED across same-bucket hints.
    let level = CompressionLevel::Best;
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // 300 KiB and 400 KiB both sit in ceil_log2 bucket 19
    // (2^18 < n <= 2^19), i.e. the same window and table widths.
    driver.set_source_size_hint(300 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    driver.set_source_size_hint(400 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: same-bucket hints must resolve to the same window \
         (a={window_a}, b={window_b})"
    );

    let snapshot_was_restored = driver.restore_primed_dictionary(level);
    assert!(
        snapshot_was_restored,
        "snapshot captured at a 300 KiB hint must be restored into a 400 KiB \
         hint that resolves to the identical matcher shape (raw bytes over-key)"
    );
}
10508
#[test]
fn primed_snapshot_restored_across_level22_tier_hints() {
    // At Level 22 several ceil-log buckets collapse onto a single upstream zstd
    // source-size tier: `resolve_level_params(Level(22), ..)` picks the HC
    // config and window_log via raw `<= 16 KiB / 128 KiB / 256 KiB`
    // thresholds. A 20 KiB hint (ceil-log bucket 15) and a 100 KiB hint
    // (bucket 17) therefore both fall into the `<= 128 KiB` tier and resolve
    // to the IDENTICAL matcher (same window_log, same HC hash/chain/search
    // geometry). A key based on the raw ceil-log bucket would still refuse the
    // restore because the buckets differ; the key must compare the resolved
    // matcher shape so that both hints share one snapshot.
    let level = CompressionLevel::Level(22);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Capture under the 20 KiB hint.
    driver.set_source_size_hint(20 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Reset under the 100 KiB hint of the same tier.
    driver.set_source_size_hint(100 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: both hints must land in the same Level 22 upstream zstd tier \
         (a={window_a}, b={window_b})"
    );

    let snapshot_was_restored = driver.restore_primed_dictionary(level);
    assert!(
        snapshot_was_restored,
        "Level 22 snapshot captured at a 20 KiB hint must be restored into a \
         100 KiB hint that resolves to the same upstream zstd tier (different ceil-log \
         buckets, identical matcher shape)"
    );
}
10545
#[test]
fn fast_dict_attaches_within_cutoff_bounds() {
    // Inside the attach bounds every Fast dict frame attaches: the copy-mode
    // owned path memmoved the whole input into history each frame, whereas
    // attach scans the input in place via the borrowed dual-base kernel. Every
    // hint below is far under `FAST_ATTACH_DICT_CUTOFF_LOG` (2 GiB source) and
    // the dict is far under `MAX_FAST_ATTACH_DICT_REGION` (16 MiB). So both a
    // hint that used to cross the old 8 KiB cutoff (8193 B) and a small one
    // (8192 B) resolve to attach, and the Simple backend reports a borrowed
    // (in-place) dict scan for each. This guards
    // `FAST_ATTACH_DICT_CUTOFF_LOG` staying high enough that no in-bounds Fast
    // hint falls back to the input-copy path. The OUT-of-bounds fallbacks are
    // covered by `fast_attach_cutoff_keeps_virtual_positions_within_u32`
    // (source) and `oversized_dict_hint_routes_fast_to_copy_mode` (dict size).
    for hint in [8192u64, 8193, 1 << 20] {
        // A fresh driver per hint keeps the iterations independent.
        let mut driver = MatchGeneratorDriver::new(8, 1);
        driver.set_source_size_hint(hint);
        driver.reset(CompressionLevel::Level(1));
        driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

        let attached = driver.borrowed_dict_supported();
        assert!(
            attached,
            "Fast dict frame with hint {hint} must attach (borrowed in-place \
             dict scan), never fall back to the copy-mode input-copy path"
        );
    }
}
10572
#[test]
fn fast_attach_cutoff_keeps_virtual_positions_within_u32() {
    // The borrowed dict kernel stores virtual positions as u32
    // (`cur_abs as u32`), which is why the cutoff is 31 rather than the full
    // u64 source-size range. The largest attached source `1 << CUTOFF` (plus
    // the dict prefix) has to stay below u32::MAX or that arithmetic wraps;
    // the next bucket (4 GiB) would wrap. Pinning both sides of the bound means
    // a future "just raise it to attach everything" change cannot silently
    // reintroduce the overflow — the kernel's position type must be widened
    // before the cutoff is raised.
    let u32_limit = u64::from(u32::MAX);

    let largest_attached_source = 1u64 << FAST_ATTACH_DICT_CUTOFF_LOG;
    assert!(
        largest_attached_source <= u32_limit,
        "the largest attached source 2^{FAST_ATTACH_DICT_CUTOFF_LOG} must fit u32 \
         virtual positions",
    );

    let following_bucket = 1u64 << (FAST_ATTACH_DICT_CUTOFF_LOG + 1);
    assert!(
        following_bucket > u32_limit,
        "the next bucket 2^{} would overflow u32 virtual positions",
        FAST_ATTACH_DICT_CUTOFF_LOG + 1,
    );
}
10594
#[test]
fn oversized_dict_hint_routes_fast_to_copy_mode() {
    // When the dict region exceeds the tagged attach position field
    // (`MAX_FAST_ATTACH_DICT_REGION`, 16 MiB), the Fast prime has to take COPY
    // mode; the tagged attach fill would overflow the packed position. The
    // decision is keyed on the load-set size hint, so passing a hint just past
    // the limit exercises it without allocating a real 16 MiB dict. In copy
    // mode the borrowed in-place dict scan (attach-only) is unavailable, which
    // is what the assertion observes.
    let oversized_hint = MAX_FAST_ATTACH_DICT_REGION + 1;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.set_dictionary_size_hint(oversized_hint);
    driver.reset(CompressionLevel::Level(1));
    driver.prime_with_dictionary(b"small dict content with some padding here", [1, 4, 8]);

    let attached = driver.borrowed_dict_supported();
    assert!(
        !attached,
        "an oversized dict must use copy mode, not the tagged attach fill"
    );
}
10612
#[test]
fn block_samples_match_dict_is_true_for_non_simple_backend() {
    // Production fallback. A non-Simple backend (Row at Level 6 here) has no
    // dict probe, so the driver wrapper CONSERVATIVELY answers `true` for ANY
    // block. That keeps the dict frame on the scan instead of letting the
    // raw-fast-path emit a block raw and miss an embedded dict segment (see
    // `dictionary_segment_in_incompressible_input_is_matched`). Only the
    // Simple/Fast backend swaps the blanket scan for a precise probe.
    let dict = b"the quick brown fox jumps over the lazy dog 0123456789abcdef";

    let mut row = MatchGeneratorDriver::new(8, 6);
    row.set_dictionary_size_hint(dict.len());
    row.reset(CompressionLevel::Level(6));
    row.prime_with_dictionary(dict, [1, 4, 8]);

    // A block taken straight from the dictionary.
    let dict_prefix = &dict[..32];
    assert!(
        row.block_samples_match_dict(dict_prefix),
        "non-Simple backend must stay on the scan (true) for a dict frame"
    );

    // A block of deterministic pseudo-random bytes.
    let noise = (0..64u8)
        .map(|i| i.wrapping_mul(37).wrapping_add(13))
        .collect::<alloc::vec::Vec<u8>>();
    assert!(
        row.block_samples_match_dict(&noise),
        "non-Simple backend reports true regardless of block content"
    );
}
10638
#[test]
fn primed_snapshot_fast_attach_does_not_over_key_non_simple_backends() {
    // `fast_attach` models only the Simple/Fast backend's 8 KiB attach-vs-copy
    // table split. Dfast, Row and HashChain each have their OWN attach/copy
    // regime (`DFAST_ATTACH_DICT_CUTOFF_LOG`, `ROW_ATTACH_DICT_CUTOFF_LOG`,
    // `HC_ATTACH_DICT_CUTOFF_LOG`), deliberately kept OUT of the `fast_attach`
    // key. Their snapshots are keyed by the resolved matcher geometry instead,
    // and the HC modes share one window geometry so an HC cross-mode restore
    // stays decodable (see `prime_with_dictionary`).
    //
    // If the `fast_attach` bit leaked into a non-Simple snapshot key, an
    // unhinted capture (which would record `fast_attach = true`) and a hinted
    // reset resolving to the IDENTICAL `LevelParams` would key differently and
    // force a needless re-prime. The test also pins the Row arm recording its
    // RESOLVED hash width on the unhinted path (a 0 default there keyed
    // unhinted-vs-hinted apart).
    //
    // The level is spelled out explicitly: `Best` now sits on level 13
    // (Btlazy2), so the named alias no longer reaches the Row arm pinned here.
    let level = CompressionLevel::Level(12);
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Unhinted capture.
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Hinted reset. The hint is large enough (>= 2^window_log) that the
    // source-size cap is a no-op, so window/params equal the unhinted level.
    driver.set_source_size_hint(64 * 1024 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: the large hint must resolve to the same window as the \
         unhinted level (a={window_a}, b={window_b})"
    );

    let snapshot_was_restored = driver.restore_primed_dictionary(level);
    assert!(
        snapshot_was_restored,
        "a Row snapshot must restore across an unhinted vs large-hinted \
         reset that resolves to the identical matcher — `fast_attach` is a Fast \
         backend concept and must not over-key non-Simple shapes"
    );
}
10685
#[cfg(any())] // disabled: tested SuffixStore-per-block tail-handling specific to legacy MatchGenerator
#[test]
fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
    let mut driver = MatchGeneratorDriver::new(8, 2);
    driver.reset(CompressionLevel::Fastest);

    // A 9-byte dictionary with slice size 8 leaves a 1-byte tail chunk
    // (capacity=1 suffix table); that tail must never be committed to the
    // matcher window.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let every_entry_long_enough = driver
        .simple()
        .window
        .iter()
        .all(|entry| entry.data.len() >= MIN_MATCH_LEN);
    assert!(
        every_entry_long_enough,
        "dictionary priming must not commit tails shorter than MIN_MATCH_LEN"
    );
}
10705
/// On the Simple backend, priming grows `max_window_size` only by the
/// dictionary bytes that are actually committed to history.
#[test]
fn prime_with_dictionary_counts_only_committed_tail_budget() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    let window_before_prime = driver.simple_mut().max_window_size;
    // 9 bytes = one full 8-byte slice followed by a 1-byte tail that cannot
    // be committed.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
    let window_after_prime = driver.simple_mut().max_window_size;

    assert_eq!(
        window_after_prime,
        window_before_prime + 8,
        "retention budget must account only for dictionary bytes actually committed to history"
    );
}
10721
/// On the Dfast backend a 4-byte dictionary tail stays in the retention
/// budget.
#[test]
fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));

    let window_before_prime = driver.dfast_matcher().max_window_size;
    // 12 bytes = one full 8-byte slice followed by a 4-byte tail. Dfast can
    // still reach that tail through short-hash overlap into the next block,
    // so it is expected to remain retained.
    driver.prime_with_dictionary(b"abcdefghijkl", [1, 4, 8]);
    let window_after_prime = driver.dfast_matcher().max_window_size;

    assert_eq!(
        window_after_prime,
        window_before_prime + 12,
        "dfast retention budget should include 4-byte dictionary tails"
    );
}
10738
#[test]
fn row_prime_with_dictionary_preserves_history_for_first_full_block() {
    // Level(5) selects the greedy Row backend (LEVEL_TABLE row 5:
    // Greedy / RowHash). Level(4) now routes to Dfast, so Level(5) is needed
    // to actually exercise `RowMatchGenerator`'s dictionary priming. With a
    // 16-byte dict followed by the same 16 bytes as the block, the whole
    // block can match the primed dict at offset = dict length = 16.
    let payload = b"abcdefghijklmnop";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    // Feed the dictionary bytes back in as the first block.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        // Look for a literal-free triple that reaches back exactly one
        // dictionary length and is at least the Row minimum match long.
        let is_full_dict_match = matches!(
            seq,
            Sequence::Triple { literals, offset, match_len }
                if literals.is_empty()
                    && offset == payload.len()
                    && match_len >= ROW_MIN_MATCH_LEN
        );
        if is_full_dict_match {
            saw_match = true;
        }
    });

    assert!(
        saw_match,
        "row backend should match dictionary-primed history in first full block"
    );
}
10777
/// On the Row backend, a dictionary tail too short to commit is excluded
/// from the retained window.
#[test]
fn row_prime_with_dictionary_subtracts_uncommitted_tail_budget() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));

    let base_window = driver.row_matcher().max_window_size;
    // With a slice size of 8, the ninth byte forms a tail shorter than 4
    // bytes. It cannot be committed, so it must be subtracted from the
    // retained budget.
    driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);
    let primed_window = driver.row_matcher().max_window_size;

    assert_eq!(
        primed_window,
        base_window + 8,
        "row retained window must exclude uncommitted 1-byte tail"
    );
}
10794
/// Once the dictionary-primed slices have been evicted from the Row window,
/// the dictionary budget drops to zero and the window returns to its base
/// size.
#[test]
fn prime_with_dictionary_budget_shrinks_after_row_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    // Shrink the live window to a single 8-byte slice so the primed
    // dictionary slices get evicted quickly.
    driver.row_matcher_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    let base_window = driver.row_matcher().max_window_size;
    // The 24-byte dictionary temporarily widens the window by 24.
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(driver.row_matcher().max_window_size, base_window + 24);

    // Commit two filler blocks without matching them.
    for filler in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buf = driver.get_next_space();
        buf.clear();
        buf.extend_from_slice(filler);
        driver.commit_space(buf);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.row_matcher().max_window_size,
        base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
10825
/// Resetting from a Row level to `Fastest` drops the Row variant, leaving
/// Simple as the active backend.
///
/// The pre-enum-era check that the window was emptied
/// (`driver.row_matcher().window.is_empty()`) is intentionally absent: once
/// the swap has happened the `Row` variant no longer exists, so there is
/// nothing left to inspect through the accessor. The "window cleared"
/// invariant is replaced by "variant dropped", and calling `row_matcher()`
/// afterwards would panic by design. Pool recycling on the row backend is
/// covered by [`driver_row_commit_recycles_block_buffer_into_pool`].
#[test]
fn row_get_last_space_then_reset_to_fastest_drops_row_variant() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), BackendTag::Row);

    // Commit one block and read it back through the Row backend.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"row-data");
    driver.commit_space(block);
    assert_eq!(driver.get_last_space(), b"row-data");

    // Switching levels must swap the backend to Simple.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.active_backend(), BackendTag::Simple);
}
10851
/// A Row commit hands the input buffer back to `vec_pool` right away.
///
/// The committed bytes are mirrored into the contiguous `history`, so
/// keeping a second copy in the window serves no purpose. This guards the
/// chunk-length window: the earlier `VecDeque<Vec<u8>>` window held on to a
/// full `block_capacity` buffer per committed block, which on a heavily
/// pre-split frame inflated peak memory to many times the live byte count.
/// With the buffer recycled at commit time, the pool grows by exactly one
/// Vec per committed block.
#[test]
fn driver_row_commit_recycles_block_buffer_into_pool() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), BackendTag::Row);

    let before_pool = driver.vec_pool.len();
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"row-data-to-recycle");
    driver.commit_space(block);
    let after_pool = driver.vec_pool.len();

    // The comparison is strict on purpose. A fresh driver starts with
    // `before_pool == 0`, so `>=` would pass even if the commit never
    // recycled anything. Strict growth shows the buffer went back to the
    // pool at commit time instead of being retained in the window (the
    // pre-`chunk_lens` bug).
    assert!(
        after_pool > before_pool,
        "row commit must recycle the committed block buffer into vec_pool \
         (before_pool = {before_pool}, after = {})",
        after_pool
    );
    // The committed bytes remain readable via the contiguous history mirror.
    assert_eq!(driver.get_last_space(), b"row-data-to-recycle");
}
10885
/// A source size of zero must clamp the window log to
/// `MIN_HINTED_WINDOW_LOG`.
#[test]
fn adjust_params_for_zero_source_size_uses_min_hinted_window_floor() {
    // Start from the unhinted Level 4 parameters with a window_log of 22.
    let mut level_params = resolve_level_params(CompressionLevel::Level(4), None);
    level_params.window_log = 22;

    let clamped = adjust_params_for_source_size(level_params, 0);

    assert_eq!(clamped.window_log, MIN_HINTED_WINDOW_LOG);
}
10893
/// Cross-checks `common_prefix_len` against a byte-by-byte reference over a
/// grid of buffer lengths, start offsets and mismatch positions.
#[test]
fn common_prefix_len_matches_scalar_reference_across_offsets() {
    /// Index of the first differing byte, or the shorter length when one
    /// slice is a prefix of the other.
    fn scalar_reference(a: &[u8], b: &[u8]) -> usize {
        let shared = a.len().min(b.len());
        (0..shared).find(|&i| a[i] != b[i]).unwrap_or(shared)
    }

    for total_len in [
        0usize, 1, 5, 15, 16, 17, 31, 32, 33, 64, 65, 127, 191, 257, 320,
    ] {
        // Deterministic, non-repeating-looking byte pattern.
        let base: Vec<u8> = (0..total_len)
            .map(|idx| ((idx * 13 + 7) & 0xFF) as u8)
            .collect();

        // Start offsets past the end of the buffer are skipped.
        for start in [0usize, 1, 3]
            .into_iter()
            .filter(|&offset| offset <= total_len)
        {
            let a = &base[start..];
            let twin = a.to_vec();

            // Identical slices.
            assert_eq!(
                common_prefix_len(a, &twin),
                scalar_reference(a, &twin),
                "equal slices total_len={total_len} start={start}"
            );

            // A single flipped byte at each in-range position.
            let len = a.len();
            for mismatch in [0usize, 1, 7, 15, 16, 31, 32, 47, 63, 95, 127, 128, 129, 191]
                .into_iter()
                .filter(|&at| at < len)
            {
                let mut altered = twin.clone();
                altered[mismatch] ^= 0x5A;
                assert_eq!(
                    common_prefix_len(a, &altered),
                    scalar_reference(a, &altered),
                    "total_len={total_len} start={start} mismatch={mismatch}"
                );
            }

            // A flipped byte at the very last position, if there is one.
            if let Some(mismatch) = len.checked_sub(1) {
                let mut altered = twin.clone();
                altered[mismatch] ^= 0xA5;
                assert_eq!(
                    common_prefix_len(a, &altered),
                    scalar_reference(a, &altered),
                    "tail mismatch total_len={total_len} start={start} mismatch={mismatch}"
                );
            }
        }
    }

    // Slices of unequal length with identical content.
    let long = alloc::vec![0xAB; 320];
    let shorter = alloc::vec![0xAB; 137];
    assert_eq!(
        common_prefix_len(&long, &shorter),
        scalar_reference(&long, &shorter)
    );
}
10956
/// With a minimum-length candidate in hand on a 64-byte run of `a`, the
/// lazy picker is expected to return `None`.
#[test]
fn row_pick_lazy_returns_none_when_next_is_better() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.add_data(alloc::vec![b'a'; 64], |_| {});
    matcher.ensure_tables();

    // Hand-built candidate of minimum length at position 16.
    let abs_pos = matcher.history_abs_start + 16;
    let current = MatchCandidate {
        start: abs_pos,
        offset: 8,
        match_len: ROW_MIN_MATCH_LEN,
    };

    let picked = matcher.pick_lazy_match(abs_pos, 0, Some(current));
    assert!(
        picked.is_none(),
        "lazy picker should defer when next position is clearly better"
    );
}
10975
/// With `lazy_depth = 2`, a candidate two positions ahead that is more than
/// one byte longer than the current best makes the lazy picker return
/// `None`.
#[test]
fn row_pick_lazy_depth2_returns_none_when_next2_significantly_better() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.lazy_depth = 2;
    matcher.search_depth = 0;
    matcher.offset_hist = [6, 9, 1];

    // 40 filler bytes with a hand-crafted pattern at positions 11..30.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAB");
    matcher.add_data(fixture, |_| {});
    matcher.ensure_tables();

    let abs_pos = matcher.history_abs_start + 20;
    let best = matcher
        .best_match(abs_pos, 0)
        .expect("expected baseline repcode match");
    assert_eq!(best.offset, 9);
    // The fixture data fixes the baseline length: the offset-9 rep run is
    // 6 bytes long, regardless of the accept threshold.
    assert_eq!(best.match_len, 6);

    // The +1 position, if it yields anything, must not beat the baseline.
    if let Some(next) = matcher.best_match(abs_pos + 1, 1) {
        assert!(next.match_len <= best.match_len);
    }

    let two_ahead = matcher
        .best_match(abs_pos + 2, 2)
        .expect("expected +2 candidate");
    assert!(
        two_ahead.match_len > best.match_len + 1,
        "+2 candidate must be significantly better for depth-2 lazy skip"
    );

    let picked = matcher.pick_lazy_match(abs_pos, 0, Some(best));
    assert!(
        picked.is_none(),
        "lazy picker should defer when +2 candidate is significantly better"
    );
}
11014
/// With `lazy_depth = 2`, a candidate two positions ahead that is only one
/// byte longer leaves the current best in place.
#[test]
fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.lazy_depth = 2;
    matcher.search_depth = 0;
    matcher.offset_hist = [6, 9, 1];

    // Same layout as the "significantly better" fixture, except that the
    // pattern's final byte is `Z` instead of `B`.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAZ");
    matcher.add_data(fixture, |_| {});
    matcher.ensure_tables();

    let abs_pos = matcher.history_abs_start + 20;
    let best = matcher
        .best_match(abs_pos, 0)
        .expect("expected baseline repcode match");
    assert_eq!(best.offset, 9);
    // The fixture data fixes the baseline length: the offset-9 rep run is
    // 6 bytes long, regardless of the accept threshold.
    assert_eq!(best.match_len, 6);

    let two_ahead = matcher
        .best_match(abs_pos + 2, 2)
        .expect("expected +2 candidate");
    assert_eq!(two_ahead.match_len, best.match_len + 1);

    // The picker must hand back the baseline candidate unchanged.
    let kept = matcher
        .pick_lazy_match(abs_pos, 0, Some(best))
        .expect("lazy picker should keep current best");
    assert_eq!(kept.start, best.start);
    assert_eq!(kept.offset, best.offset);
    assert_eq!(kept.match_len, best.match_len);
}
11048
/// Checks that `hash_and_row` splits the shared hash mix into row index and
/// tag exactly as this test recomputes them by hand.
#[test]
fn row_hash_and_row_extracts_high_bits() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    let sample = alloc::vec![
        0xAA, 0xBB, 0xCC, 0x11, 0x10, 0x20, 0x30, 0x40, 0xAA, 0xBB, 0xCC, 0x22, 0x50, 0x60,
        0x70, 0x80,
    ];
    matcher.add_data(sample, |_| {});
    matcher.ensure_tables();

    let pos = matcher.history_abs_start + 8;
    let (row, tag) = matcher
        .hash_and_row(pos)
        .expect("row hash should be available");

    // Recompute the key the way `row_key_value` does: an mls-wide masked key
    // when 8 lookahead bytes exist, the 4-byte key in the tail. Relative
    // index 8 in a 16-byte history leaves exactly 8 bytes, so the wide arm
    // applies.
    let rel = pos - matcher.history_abs_start;
    let lookahead: [u8; 8] = matcher.live_history()[rel..rel + 8].try_into().unwrap();
    let key_len = matcher.mls.min(6);
    let key_mask = (1u64 << (key_len * 8)) - 1;
    let value = u64::from_le_bytes(lookahead) & key_mask;

    // The top `row_hash_log + ROW_TAG_BITS` bits of the mix carry the row
    // index (upper part) followed by the tag (low byte).
    let hash = crate::encoding::fastpath::hash_mix_u64_with_kernel(matcher.hash_kernel, value);
    let total_bits = matcher.row_hash_log + ROW_TAG_BITS;
    let combined = hash >> (u64::BITS as usize - total_bits);
    let row_mask = (1usize << matcher.row_hash_log) - 1;
    let expected_row = ((combined >> ROW_TAG_BITS) as usize) & row_mask;
    let expected_tag = combined as u8;

    assert_eq!(row, expected_row);
    assert_eq!(tag, expected_tag);
}
11086
/// A repcode candidate that would land before the start of the retained
/// history must be rejected.
#[test]
fn row_repcode_skips_candidate_before_history_start() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    // The retained history begins at absolute position 10. The only
    // non-zero repcode offset is 3, and 12 - 3 = 9 lies before that start.
    matcher.history_abs_start = 10;
    matcher.history_start = 0;
    matcher.history = alloc::vec![b'a'; 20];
    matcher.offset_hist = [3, 0, 0];

    assert!(matcher.repcode_candidate(12, 1).is_none());
}
11098
/// A repcode probe at the last byte of the history must yield no candidate.
#[test]
fn row_repcode_returns_none_when_position_too_close_to_history_end() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    // Five bytes of history starting at absolute position 0; position 4 is
    // the final byte, so only one byte is left to compare.
    matcher.history_abs_start = 0;
    matcher.history_start = 0;
    matcher.history = b"abcde".to_vec();
    matcher.offset_hist = [1, 0, 0];

    assert!(matcher.repcode_candidate(4, 1).is_none());
}
11110
/// On x86_64 hosts with SSE4.2, the dispatched hash mix must equal the
/// direct SSE4.2 implementation. The test is a no-op on hosts without it.
#[cfg(all(feature = "std", target_arch = "x86_64"))]
#[test]
fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath::{self, FastpathKernel};

    if !is_x86_feature_detected!("sse4.2") {
        return;
    }

    let input = 0x0123_4567_89AB_CDEFu64;
    // SAFETY: the runtime feature check above guarantees SSE4.2 is available.
    let accelerated = unsafe { fastpath::sse42::hash_mix_u64(input) };
    // The dispatcher must resolve to SSE4.2 (or better) and yield the same mix.
    let dispatched = fastpath::dispatch_hash_mix_u64(input);

    match fastpath::select_kernel() {
        FastpathKernel::Sse42 => assert_eq!(dispatched, accelerated),
        // The AVX2 kernel relies on the same CRC32 instruction under the hood.
        _ => assert_eq!(dispatched, accelerated, "AVX2/SSE4.2 share CRC32 mix"),
    }
}
11131
/// On little-endian aarch64 hosts with the CRC extension, the dispatched
/// hash mix must equal the direct NEON-module implementation. The test is a
/// no-op on hosts without it.
#[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))]
#[test]
fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath;

    if !is_aarch64_feature_detected!("crc") {
        return;
    }

    let input = 0x0123_4567_89AB_CDEFu64;
    // SAFETY: the runtime feature check above guarantees CRC32 is available.
    let accelerated = unsafe { fastpath::neon::hash_mix_u64(input) };
    let dispatched = fastpath::dispatch_hash_mix_u64(input);

    assert_eq!(dispatched, accelerated);
}
11145
/// `MatchTable::hash3_position` must agree with the 3-byte hash formula
/// recomputed here: shift the little-endian word left by 8, multiply by
/// `HC_PRIME3BYTES`, keep the top `HC3_HASH_LOG` bits.
#[test]
fn hc_hash3_position_matches_hash3_formula() {
    use super::match_table::storage::MatchTable;

    let bytes = [b'a', b'b', b'c', b'd'];

    // Manual recomputation of the expected slot.
    let word = u32::from_le_bytes(bytes);
    let mixed = (word << 8).wrapping_mul(HC_PRIME3BYTES);
    let expected = (mixed >> (32 - HC3_HASH_LOG)) as usize;

    assert_eq!(MatchTable::hash3_position(&bytes, HC3_HASH_LOG), expected);
}
11156
/// The table's `hash_position` must agree with the 4-byte hash formula
/// recomputed here: multiply the little-endian word by `HC_PRIME4BYTES` and
/// keep the top `hash_log` bits.
#[test]
fn hc_hash_position_matches_hash4_formula() {
    use super::strategy::StrategyTag;

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(HC_CONFIG, StrategyTag::Lazy, 22);

    let bytes = [b'a', b'b', b'c', b'd'];

    // Manual recomputation of the expected slot.
    let word = u32::from_le_bytes(bytes);
    let mixed = word.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - hc.table.hash_log)) as usize;

    assert_eq!(hc.table.hash_position(&bytes), expected);
}
11166
/// Under the BtUltra2 Level 22 config, hashing with `BtMatcher::HASH_MLS`
/// must produce the 4-byte hash formula's result, even when 8 input bytes
/// are supplied.
#[test]
fn btultra2_main_hash_uses_hash4_formula() {
    use super::bt::BtMatcher;
    use super::match_table::storage::MatchTable;
    use super::strategy::StrategyTag;

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG_L22, StrategyTag::BtUltra2, 27);

    let bytes = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'];

    // Only the first four bytes feed the reference computation.
    let word = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    let mixed = word.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - hc.table.hash_log)) as usize;

    let actual =
        MatchTable::hash_position_with_mls(&bytes, hc.table.hash_log, BtMatcher::HASH_MLS);
    assert_eq!(actual, expected);
}
11185
/// `row_candidate` must report no candidate when the history holds fewer than
/// `ROW_MIN_MATCH_LEN` bytes from the queried position.
#[test]
fn row_candidate_returns_none_when_abs_pos_near_end_of_history() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    // The history is exactly one byte too short for a minimum-length match
    // at position 0. The row tables are never built in this test, so the
    // lookup is expected to bail out on the length check alone.
    let short_len = ROW_MIN_MATCH_LEN - 1;
    matcher.history_abs_start = 0;
    matcher.history_start = 0;
    matcher.history = alloc::vec![b'a'; short_len];

    let found = matcher.row_candidate(0, 0);
    assert!(found.is_none());
}
11200
/// With only three bytes of history, `chain_candidates` must return nothing
/// but `usize::MAX` sentinel entries.
#[test]
fn hc_chain_candidates_returns_sentinels_for_short_suffix() {
    let mut generator = HcMatchGenerator::new(32);
    generator.table.history = b"abc".to_vec();
    generator.table.history_abs_start = 0;
    generator.table.history_start = 0;
    generator.table.ensure_tables();

    let found = generator.hc.chain_candidates(&generator.table, 0);
    let has_real_candidate = found.iter().any(|&pos| pos != usize::MAX);
    assert!(!has_real_candidate);
}
11212
/// After `reset`, `history_abs_start` must equal the previous frame's end, and
/// every hash slot that still decodes to a position must decode below that
/// floor.
#[test]
fn hc_reset_advances_floor_past_prior_frame_entries() {
    use super::match_table::storage::MatchTable;
    let mut hc = HcMatchGenerator::new(32);
    hc.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    hc.table.ensure_tables();
    // Seed the tables with genuine entries from the first frame.
    hc.table.insert_positions(0, 6);
    let frame_end = hc.table.history_abs_end();
    assert_eq!(frame_end, 10);
    let any_populated = hc.table.hash_table.iter().any(|&slot| slot != HC_EMPTY);
    assert!(any_populated);

    hc.reset(|_| {});

    // Contract under test: old entries may stay in the table, but they must
    // all resolve strictly below the new floor. The `window_low` guard is
    // then expected to reject them — that guard is not exercised here.
    let floor = hc.table.history_abs_start;
    assert_eq!(floor, frame_end);
    let (base, shift) = (hc.table.position_base, hc.table.index_shift);
    for slot in hc.table.hash_table.iter().copied() {
        let Some(candidate_abs) = MatchTable::stored_abs_position_fast(slot, base, shift) else {
            continue;
        };
        assert!(
            candidate_abs < floor,
            "a prior-frame entry must resolve below the advanced floor"
        );
    }
}
11244
/// When `history_abs_start` already sits at `REBASE_RESET_FLOOR_CEILING`,
/// `reset` must rewind both origins to 0 and leave every hash/chain slot at
/// `HC_EMPTY`.
#[test]
fn hc_reset_full_zeroes_when_floor_would_cross_ceiling() {
    use super::match_table::storage::REBASE_RESET_FLOOR_CEILING;
    let mut generator = HcMatchGenerator::new(32);
    generator.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    generator.table.ensure_tables();
    // Fill the tables with recognisable values so clearing is observable.
    generator.table.hash_table.fill(123);
    generator.table.chain_table.fill(456);
    // Start at the ceiling so the next floor (`history_abs_end`) would lie
    // beyond it. `reset` is then expected to take its bounded fallback
    // rather than let the absolute cursor keep growing towards
    // `usize::MAX` on 32-bit targets.
    generator.table.history_abs_start = REBASE_RESET_FLOOR_CEILING;

    generator.reset(|_| {});

    assert_eq!(generator.table.history_abs_start, 0);
    assert_eq!(generator.table.position_base, 0);
    let hash_dirty = generator.table.hash_table.iter().any(|&v| v != HC_EMPTY);
    assert!(!hash_dirty);
    let chain_dirty = generator.table.chain_table.iter().any(|&v| v != HC_EMPTY);
    assert!(!chain_dirty);
}
11266
/// `start_matching` on an empty committed block must never invoke the
/// sequence callback.
#[test]
fn hc_start_matching_returns_early_for_empty_current_block() {
    let mut generator = HcMatchGenerator::new(32);
    generator.table.add_data(Vec::new(), |_| {});
    let mut emitted = 0usize;
    generator.start_matching(|_| emitted += 1);
    assert!(emitted == 0, "empty current block should not emit sequences");
}
11275
/// Produces `len` pseudo-random bytes from a 64-bit xorshift generator
/// (shifts 13 / 7 / 17) started at `seed`.
///
/// Each output byte is bits 40..48 of the generator state after one step, so
/// the stream is fully determined by `seed`. A zero seed yields only zeros.
#[cfg(test)]
fn deterministic_high_entropy_bytes(seed: u64, len: usize) -> Vec<u8> {
    let mut rng = seed;
    (0..len)
        .map(|_| {
            rng ^= rng << 13;
            rng ^= rng >> 7;
            rng ^= rng << 17;
            (rng >> 40) as u8
        })
        .collect()
}
11288
/// Splits `data` into consecutive `(start, len)` block ranges using the
/// level-22 `optimal_block_size` decision.
///
/// Every block is at least 1 byte and at most
/// `min(remaining, HC_BLOCKSIZE_MAX)` bytes, so the ranges tile `data` exactly.
#[cfg(feature = "bench_internals")]
pub(crate) fn level22_block_ranges(data: &[u8]) -> Vec<(usize, usize)> {
    let block_max = super::cost_model::HC_BLOCKSIZE_MAX;
    let total = data.len();
    let mut ranges = Vec::new();
    let mut savings = 0i64;
    let mut start = 0usize;
    while start < total {
        let left = total - start;
        let window_len = left.min(block_max);
        let proposed = crate::encoding::frame_compressor::optimal_block_size(
            CompressionLevel::Level(22),
            &data[start..start + window_len],
            left,
            block_max,
            savings,
        );
        // Clamp into [1, window_len] so the cursor always advances and never
        // overruns the input.
        let block_len = proposed.min(window_len).max(1);
        ranges.push((start, block_len));
        start += block_len;
        // Upstream zstd gates the pre-block splitter on real compressed-size
        // savings. This parity harness approximates that: once a full block's
        // worth of input has been consumed, a positive `savings` value is
        // passed so the same splitter path is authorised.
        if start >= block_max {
            savings = 3;
        }
    }
    ranges
}
11317
11318#[cfg(feature = "bench_internals")]
/// Folds block-delimiter pseudo-sequences (`offset == 0 && match_len == 0`)
/// into the literal length of the next real sequence.
///
/// Delimiter literal counts accumulate with saturating addition. If delimiters
/// remain after the last real sequence and carry a non-zero literal count, one
/// trailing `(literals, 0, 0)` entry is emitted for them.
fn merge_block_delimiters(sequences: Vec<(usize, usize, usize)>) -> Vec<(usize, usize, usize)> {
    let mut merged = Vec::with_capacity(sequences.len());
    let mut carried = 0usize;
    for triple in sequences {
        match triple {
            // Delimiter: remember its literals for whatever comes next.
            (lits, 0, 0) => carried = carried.saturating_add(lits),
            (lits, offset, match_len) => {
                merged.push((lits.saturating_add(carried), offset, match_len));
                carried = 0;
            }
        }
    }
    if carried != 0 {
        merged.push((carried, 0, 0));
    }
    merged
}
11335
/// White-box dump of the level-22 sequence stream for `data`, as
/// `(literal_len, offset, match_len)` triples.
///
/// Block-delimiter pseudo-sequences are merged into the literal run of the
/// triple that follows them, and any leftover literals-only entry is dropped,
/// so only real matches remain. Pure Rust; the C-conformance comparison that
/// consumes this lives in the `ffi-bench` crate.
#[cfg(feature = "bench_internals")]
pub(crate) fn collect_level22_sequences(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let raw = collect_level22_sequences_with_delimiters(data);
    let mut merged = merge_block_delimiters(raw);
    // Drop the trailing literals-only entry the merge step may leave behind.
    merged.retain(|&(_, offset, match_len)| offset != 0 || match_len != 0);
    merged
}
11348
/// Runs a level-22 `MatchGeneratorDriver` over `data`, block by block as
/// chosen by `level22_block_ranges`, and records each emitted sequence as
/// `(literal_len, offset, match_len)`.
///
/// Literals-only sequences are recorded with offset and match length 0.
#[cfg(feature = "bench_internals")]
fn collect_level22_sequences_with_delimiters(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let mut driver = MatchGeneratorDriver::new(super::cost_model::HC_BLOCKSIZE_MAX, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Level(22));

    let mut captured = Vec::new();
    for (start, len) in level22_block_ranges(data) {
        let block = &data[start..start + len];
        // Copy the block into the front of the driver's buffer, then trim
        // the buffer to exactly the block's length before committing.
        let mut space = driver.get_next_space();
        space[..len].copy_from_slice(block);
        space.truncate(len);
        driver.commit_space(space);
        driver.start_matching(|seq| {
            captured.push(match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            });
        });
    }
    captured
}
11376
/// A block skipped with the `Some(true)` hint must still leave its final bytes
/// matchable from the next block.
///
/// The first block is high-entropy data ending in `tail`; the second block
/// starts with the same `tail`. The first sequence emitted for the second
/// block must be a zero-literal match at offset `tail.len()` covering at
/// least `tail.len()` bytes.
#[test]
fn hc_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";
    let mut first = deterministic_high_entropy_bytes(0xD1B5_4A32_9C77_0E19, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    // `first` is not read again, so hand the buffer over instead of cloning
    // it (the BtUltra2 variant of this test does the same).
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.table.add_data(second, |_| {});

    // Only the first emitted sequence matters; later ones are ignored.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_some() {
            return;
        }
        first_sequence = Some(match seq {
            Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals.len(), offset, match_len),
        });
    });

    let (literals_len, offset, match_len) =
        first_sequence.expect("expected at least one sequence after sparse skip");
    assert_eq!(
        literals_len, 0,
        "first sequence should start at block boundary"
    );
    assert_eq!(
        offset,
        tail.len(),
        "first match should reference previous tail"
    );
    assert!(
        match_len >= tail.len(),
        "tail-aligned cross-block match must be preserved"
    );
}
11422
/// BtUltra2 counterpart of the sparse-skip tail test: after skipping a
/// high-entropy block with the `Some(true)` hint, the next block's first
/// sequence must be a zero-literal match back onto the previous block's tail.
#[test]
fn btultra2_sparse_skip_matching_preserves_tail_cross_block_match() {
    let shared_tail = b"Bt9kLm2Rp";
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(
        BTULTRA2_HC_CONFIG_L22,
        super::strategy::StrategyTag::BtUltra2,
        20,
    );

    // Block one: noise whose last bytes are overwritten with the shared tail.
    let mut block_one = deterministic_high_entropy_bytes(0xA9C3_7F21_D4E8_510B, 4096);
    let split = block_one.len() - shared_tail.len();
    block_one[split..].copy_from_slice(shared_tail);
    matcher.table.add_data(block_one, |_| {});
    matcher.skip_matching(Some(true));

    // Block two: the shared tail again, followed by fresh literals.
    let mut block_two = shared_tail.to_vec();
    block_two.extend_from_slice(b"after-tail-literals");
    matcher.table.add_data(block_two, |_| {});

    // Capture only the first emitted sequence.
    let mut head = None;
    matcher.start_matching(|seq| {
        if head.is_none() {
            head = Some(match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            });
        }
    });

    let (literals_len, offset, match_len) =
        head.expect("expected at least one sequence after sparse BT skip");
    assert_eq!(
        literals_len, 0,
        "BT sparse skip should preserve an immediate boundary match"
    );
    assert_eq!(
        offset,
        shared_tail.len(),
        "first BT match should reference previous tail"
    );
    assert!(
        match_len >= shared_tail.len(),
        "BT sparse skip must seed the dense tail for cross-block matching"
    );
}
11473
/// After a `Some(true)` skip, a position that lies both on the
/// `INCOMPRESSIBLE_SKIP_STEP` grid and inside the final
/// `HC_MIN_MATCH_LEN + INCOMPRESSIBLE_SKIP_STEP` bytes must not have been
/// inserted twice.
///
/// A double insert would show up as a chain entry pointing at itself
/// (`chain_table[idx] == rel + 1`).
#[test]
fn hc_sparse_skip_matching_does_not_reinsert_sparse_tail_positions() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let first = deterministic_high_entropy_bytes(0xC2B2_AE3D_27D4_EB4F, 4096);
    // Record the length up front so the buffer can be moved into the table
    // rather than cloned just to query `len()` afterwards.
    let current_len = first.len();
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Reconstruct the absolute span of the block that was just skipped.
    let current_abs_start =
        matcher.table.history_abs_start + matcher.table.window_size - current_len;
    let current_abs_end = current_abs_start + current_len;
    let dense_tail = HC_MIN_MATCH_LEN + INCOMPRESSIBLE_SKIP_STEP;
    let tail_start = current_abs_end
        .saturating_sub(dense_tail)
        .max(matcher.table.history_abs_start)
        .max(current_abs_start);

    // First tail position that also lies on the sparse insertion grid.
    let overlap_pos = (tail_start..current_abs_end)
        .find(|&pos| (pos - current_abs_start).is_multiple_of(INCOMPRESSIBLE_SKIP_STEP))
        .expect("fixture should contain at least one sparse-grid overlap in dense tail");

    let rel = matcher
        .table
        .relative_position(overlap_pos)
        .expect("overlap position should be representable as relative position");
    let chain_idx = rel as usize & ((1 << matcher.table.chain_log) - 1);
    assert_ne!(
        matcher.table.chain_table[chain_idx],
        rel + 1,
        "sparse-grid tail positions must not be reinserted (self-loop chain entry)"
    );
}
11506
/// `compact_history` must drop the dead prefix: with 26 bytes of history and
/// `history_start` at 16, only the last 10 bytes remain and the start offset
/// resets to 0.
#[test]
fn hc_compact_history_drains_when_threshold_crossed() {
    let mut generator = HcMatchGenerator::new(8);
    generator.table.history_start = 16;
    generator.table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    generator.table.compact_history();
    assert_eq!(generator.table.history_start, 0);
    assert_eq!(generator.table.history, b"qrstuvwxyz");
}
11516
/// `insert_position_no_rebase` must leave both tables untouched when called
/// with absolute position 0 while `position_base` is 1.
#[test]
fn hc_insert_position_no_rebase_returns_when_relative_pos_unavailable() {
    let mut generator = HcMatchGenerator::new(32);
    generator.table.history = b"abcdefghijklmnop".to_vec();
    generator.table.history_abs_start = 0;
    // Position 0 precedes the base, so presumably no relative position can
    // be formed for it.
    generator.table.position_base = 1;
    generator.table.ensure_tables();
    let snapshot = (
        generator.table.hash_table.clone(),
        generator.table.chain_table.clone(),
    );

    generator.table.insert_position_no_rebase(0);

    let (hash_before, chain_before) = snapshot;
    assert_eq!(generator.table.hash_table, hash_before);
    assert_eq!(generator.table.chain_table, chain_before);
}
11532
/// A contiguous `insert_positions(0, 9)` must leave `next_to_update3` at 9.
#[test]
fn hc_insert_positions_advances_next_to_update3_for_contiguous_range() {
    let mut generator = HcMatchGenerator::new(64);
    {
        // Hand-built table state: 26 bytes of history, all origins at zero.
        let table = &mut generator.table;
        table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
        table.history_start = 0;
        table.history_abs_start = 0;
        table.position_base = 0;
        table.ensure_tables();
        table.next_to_update3 = 0;
    }

    generator.table.insert_positions(0, 9);

    let cursor = generator.table.next_to_update3;
    assert_eq!(
        cursor, 9,
        "contiguous insert_positions should advance hash3 update cursor"
    );
}
11550
/// A strided `insert_positions_with_step(0, 16, 4)` must leave
/// `next_to_update3` at its initial value of 0.
#[test]
fn hc_insert_positions_with_step_keeps_next_to_update3_cursor_for_sparse_ranges() {
    let mut generator = HcMatchGenerator::new(64);
    {
        // Hand-built table state: 26 bytes of history, all origins at zero.
        let table = &mut generator.table;
        table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
        table.history_start = 0;
        table.history_abs_start = 0;
        table.position_base = 0;
        table.ensure_tables();
        table.next_to_update3 = 0;
    }

    generator.table.insert_positions_with_step(0, 16, 4);

    let cursor = generator.table.next_to_update3;
    assert_eq!(
        cursor, 0,
        "sparse insert_positions_with_step must not mark skipped positions as hash3-updated"
    );
}
11568
// `#[cfg(any())]` is never true, so this test is compiled out entirely.
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_dictionary_budget_shrinks_after_simple_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    // Use a small live window so dictionary-primed slices are evicted
    // quickly and budget retirement can be asserted deterministically.
    driver.simple_mut().max_window_size = 8;
    driver.reported_window_size = 8;

    // Priming with a 24-byte dictionary must grow the window by exactly 24.
    let base_window = driver.simple_mut().max_window_size;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(driver.simple_mut().max_window_size, base_window + 24);

    // Commit two 8-byte blocks without matching so the dictionary slices
    // are pushed out of the 8-byte live window.
    for block in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut space = driver.get_next_space();
        space.clear();
        space.extend_from_slice(block);
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.simple_mut().max_window_size,
        base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11602
/// On the level-3 (Dfast) backend, the window inflation granted by
/// `prime_with_dictionary` must be fully withdrawn once the primed dictionary
/// bytes have been evicted from the live window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_dfast_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));
    // Shrink the live window to 8 bytes so two small commits are enough to
    // evict the dictionary, which keeps the retirement assertion deterministic.
    driver.dfast_matcher_mut().max_window_size = 8;
    driver.reported_window_size = 8;
    let base_window = driver.dfast_matcher().max_window_size;

    // A 24-byte dictionary must inflate the window by exactly 24 bytes.
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let primed_window = driver.dfast_matcher().max_window_size;
    assert_eq!(primed_window, base_window + 24);

    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut space = driver.get_next_space();
        space.clear();
        space.extend_from_slice(payload);
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let final_window = driver.dfast_matcher().max_window_size;
    assert_eq!(
        final_window, base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11634
/// After priming the HashChain backend with an 8-byte dictionary, a first
/// block that repeats those 8 bytes must yield a zero-literal match at
/// offset 8.
#[test]
fn hc_prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    // `Better` maps to the Row backend in production, so force the
    // HashChain lazy path — this test is about HC dictionary priming.
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // The block is a verbatim copy of the dictionary; an 8-byte repeat is
    // expected to clear the `HC_MIN_MATCH_LEN` threshold.
    let mut space = driver.get_next_space();
    space.clear();
    space.extend_from_slice(b"abcdefgh");
    driver.commit_space(space);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= HC_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "hash-chain backend should match dictionary-primed history in first full block"
    );
}
11671
/// On the HashChain backend, the window inflation granted by
/// `prime_with_dictionary` must be fully withdrawn once the primed dictionary
/// bytes have been evicted from the live window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_hc_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // An 8-byte live window makes the dictionary fall out after two commits.
    driver.hc_matcher_mut().table.max_window_size = 8;
    driver.reported_window_size = 8;
    let base_window = driver.hc_matcher().table.max_window_size;

    // A 24-byte dictionary must inflate the window by exactly 24 bytes.
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let primed_window = driver.hc_matcher().table.max_window_size;
    assert_eq!(primed_window, base_window + 24);

    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut space = driver.get_next_space();
        space.clear();
        space.extend_from_slice(payload);
        driver.commit_space(space);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let final_window = driver.hc_matcher().table.max_window_size;
    assert_eq!(
        final_window, base_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11702
/// `reapply_resident_dictionary` must restore `dictionary_retained_budget` to
/// the amount by which `reset` re-inflated the HashChain window.
#[test]
fn resident_reapply_restores_retained_dictionary_budget() {
    // Scenario: a second frame reuses the dictionary that is already
    // resident, so it skips the re-prime. The per-frame `reset` zeroes the
    // driver-level retained budget while the matcher's own reset grows
    // `max_window_size` by the dictionary region again. If the budget is not
    // restored, `retire_dictionary_budget` has nothing to retire and the
    // inflated window never shrinks as the dictionary evicts. HashChain
    // measures `window_low` against `max_window_size`, so a match found after
    // eviction could then reach beyond the base window announced in the
    // frame header and produce an over-window offset.
    let dict = b"abcdefghABCDEFGHijklmnopqrstuvwxyz0123456789";
    let mut driver = MatchGeneratorDriver::new(1 << 16, 1);

    // Frame one: prime normally and note the reported base window.
    driver.set_dictionary_size_hint(dict.len());
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    let base = driver.reported_window_size;
    assert!(
        driver.dictionary_retained_budget > 0,
        "the priming frame must retain a non-zero dict budget"
    );

    // Frame two: `reset` recognises the resident dictionary and re-borrows it.
    driver.set_dictionary_size_hint(dict.len());
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    assert!(
        driver.dictionary_is_resident(),
        "the second frame must re-borrow the resident dictionary"
    );
    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "reset clears the retained-dict budget"
    );
    let inflated = driver.hc_matcher().table.max_window_size;
    assert!(
        inflated > base,
        "reset re-inflates the window by the resident dict region \
         (inflated={inflated}, base={base})"
    );

    driver.reapply_resident_dictionary([1, 4, 8]);
    let expected_budget = inflated - base;
    assert_eq!(
        driver.dictionary_retained_budget,
        expected_budget,
        "resident reapply must restore the retained-dict budget (= window \
         inflation) so the retire path can shrink the window as the dict evicts"
    );
}
11750
/// Committing a small block into a HashChain window that is nowhere near full
/// must leave `dictionary_retained_budget` unchanged.
#[test]
fn hc_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression guard. Since the window<->history dedup,
    // `MatchTable::add_data` calls its `reuse_space` callback with the
    // *input* buffer (to recycle it), not with evicted chunks. The HC arm of
    // `commit_space` therefore has to compute evicted bytes from the change
    // in `window_size`. Treating the callback argument as evicted data would
    // bill the entire committed block as eviction and retire dictionary
    // budget far too early.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // 1 MiB live window: an 8-byte commit cannot evict anything.
    let roomy_window = 1 << 20;
    driver.hc_matcher_mut().table.max_window_size = roomy_window;
    driver.reported_window_size = 1 << 20;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let primed_budget = driver.dictionary_retained_budget;
    assert!(
        primed_budget > 0,
        "priming must retain a non-zero dictionary budget"
    );

    let mut space = driver.get_next_space();
    space.clear();
    space.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    assert_eq!(
        driver.dictionary_retained_budget, primed_budget,
        "a commit that evicts nothing must retire no dictionary budget"
    );
}
11783
/// Committing a small block into a Row window that is nowhere near full must
/// leave `dictionary_retained_budget` unchanged.
#[test]
fn row_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression guard for the Row arm of `commit_space`. After the
    // window -> chunk_lens migration, `RowMatchGenerator::add_data` calls its
    // `reuse_space` callback with the *input* buffer (recycled on every
    // commit), not with evicted chunks. Like the Dfast and HashChain arms,
    // the Row arm has to compute evicted bytes from the change in
    // `window_size`. Treating the callback argument as evicted data would
    // bill the entire committed block as eviction and retire dictionary
    // budget far too early.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    let on_row_backend = matches!(driver.storage, MatcherStorage::Row(_));
    assert!(on_row_backend);
    // 1 MiB live window: an 8-byte commit cannot evict anything.
    let roomy_window = 1 << 20;
    driver.row_matcher_mut().max_window_size = roomy_window;
    driver.reported_window_size = 1 << 20;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let primed_budget = driver.dictionary_retained_budget;
    assert!(
        primed_budget > 0,
        "priming must retain a non-zero dictionary budget"
    );

    let mut space = driver.get_next_space();
    space.clear();
    space.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    assert_eq!(
        driver.dictionary_retained_budget, primed_budget,
        "a Row commit that evicts nothing must retire no dictionary budget"
    );
}
11818
/// When `history_abs_start` lies beyond `u32::MAX`, `skip_matching` must
/// rebase `position_base` onto it and still leave usable hash entries and
/// chain candidates. Returns early on targets where that start value does
/// not fit in `usize`.
#[test]
fn hc_rebases_positions_after_u32_boundary() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;
    let Ok(beyond_u32) = usize::try_from(u64::from(u32::MAX) + 64) else {
        return;
    };
    // Pretend the stream has been running long enough for absolute positions
    // to leave the u32 range. Prior to #51 that switched HC inserts off.
    matcher.table.history_abs_start = beyond_u32;
    matcher.skip_matching(None);
    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should anchor to the oldest live absolute position"
    );

    let table_populated = matcher
        .table
        .hash_table
        .iter()
        .any(|&entry| entry != HC_EMPTY);
    assert!(
        table_populated,
        "HC hash table should still be populated after crossing u32 boundary"
    );

    // Populated tables are not enough: lookups must still find candidates.
    let probe = matcher.table.history_abs_start + 10;
    let candidates = matcher.hc.chain_candidates(&matcher.table, probe);
    let found_real = candidates.iter().any(|&candidate| candidate != usize::MAX);
    assert!(
        found_real,
        "chain_candidates should return valid matches after rebase"
    );
}
11855
// Restricted to 64-bit targets. A 32-bit `usize` cannot represent the >4 GiB
// absolute cursor fabricated here, and placing `history_abs_start` next to
// `u32::MAX` on such a target overflows `usize` inside the
// `check_stream_abs_headroom` guard before the rebase path runs. This plays
// the same role as the `try_into()` early return in
// `hc_rebases_positions_after_u32_boundary`.
#[cfg(target_pointer_width = "64")]
#[test]
fn row_rebases_positions_after_u32_boundary() {
    // The Row matcher keeps absolute match positions in u32. On a long stream
    // the running absolute cursor leaves the u32 range even though the live
    // window stays small, so `add_data` has to move the coordinate origin
    // down to the oldest live byte. Before the rebase existed this tripped a
    // `< u32::MAX` assertion and valid long Row-backed frames were lost.
    let mut matcher = RowMatchGenerator::new(64);
    matcher.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});

    // Fake roughly 4 GiB of already-consumed stream: the live bytes now sit
    // 16 positions below the u32 ceiling.
    let ceiling = u32::MAX as usize;
    let near_ceiling = ceiling - 16;
    matcher.history_abs_start = near_ceiling;

    // This commit would cross the ceiling unless the origin is rebased.
    matcher.add_data(b"fghij".to_vec(), |_| {});

    assert!(
        matcher.history_abs_start < near_ceiling,
        "add_data must rebase the absolute origin down when the cursor nears \
         u32::MAX (got {})",
        matcher.history_abs_start
    );
    let live_end = matcher.history_abs_start + matcher.window_size;
    assert!(
        live_end < u32::MAX as usize,
        "after rebase the live window must fit below the u32 position ceiling"
    );
}
11893
/// `maybe_rebase_positions(abs_pos)` must produce the tables you would get by
/// inserting exactly the positions in `history_abs_start..abs_pos` into empty
/// tables. Returns early on targets where the fabricated start does not fit
/// in `usize`.
#[test]
fn hc_rebase_rebuilds_only_inserted_prefix() {
    let Ok(beyond_u32) = usize::try_from(u64::from(u32::MAX) + 64) else {
        return;
    };

    // Subject: stale base at 0 with the history start beyond the u32 range.
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;
    matcher.table.history_abs_start = beyond_u32;
    let abs_pos = matcher.table.history_abs_start + 6;

    // Reference: same data, already anchored, rebuilt by hand from empty
    // tables with only the positions before `abs_pos`.
    let mut expected = HcMatchGenerator::new(64);
    expected.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    expected.table.ensure_tables();
    expected.table.history_abs_start = beyond_u32;
    expected.table.position_base = expected.table.history_abs_start;
    expected.table.hash_table.fill(HC_EMPTY);
    expected.table.chain_table.fill(HC_EMPTY);
    let inserted_prefix = expected.table.history_abs_start..abs_pos;
    inserted_prefix.for_each(|pos| expected.table.insert_position_no_rebase(pos));

    matcher.table.maybe_rebase_positions(abs_pos);

    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should still anchor to the oldest live absolute position"
    );
    assert_eq!(
        matcher.table.hash_table, expected.table.hash_table,
        "rebase must rebuild only positions already inserted before abs_pos"
    );
    assert_eq!(
        matcher.table.chain_table, expected.table.chain_table,
        "future positions must not be pre-seeded into HC chains during rebase"
    );
}
11933
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn suffix_store_with_single_slot_does_not_panic_on_keying() {
    // A store with capacity 1 must still accept and return one key.
    let key = b"abcde";
    let mut store = SuffixStore::with_capacity(1);
    store.insert(key, 0);
    assert!(store.contains_key(key));
    assert_eq!(store.get(key), Some(0));
}
11942
#[cfg(any())]
// disabled: hash_fill_step is a legacy MatchGenerator field; FastKernelMatcher walks stride=1 today
#[test]
fn fastest_reset_uses_interleaved_hash_fill_step() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Levels inspected through `driver.simple()`, paired with the stride
    // each reset is expected to leave behind.
    for (level, expected_step) in [
        (CompressionLevel::Uncompressed, 1),
        (CompressionLevel::Fastest, FAST_HASH_FILL_STEP),
    ] {
        driver.reset(level);
        assert_eq!(driver.simple().hash_fill_step, expected_step);
    }

    // Better uses the HashChain backend with lazy2; verify that the backend switch
    // happened and the lazy_depth is configured correctly.
    driver.reset(CompressionLevel::Better);
    let backend = driver.active_backend();
    assert_eq!(backend, super::strategy::BackendTag::HashChain);
    assert_eq!(driver.window_size(), (1u64 << 23));
    assert_eq!(driver.hc_matcher().hc.lazy_depth, 2);
}
11965
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_updates_offset_history_after_emitting_match() {
    let mut matcher = MatchGenerator::new(64);
    let suffixes = SuffixStore::with_capacity(64);
    matcher.add_data(b"abcdeabcdeabcde".to_vec(), suffixes, |_, _| {});

    // First sequence: five literals, then a ten-byte copy from offset 5.
    let expected = Sequence::Triple {
        literals: b"abcde",
        offset: 5,
        match_len: 10,
    };
    let emitted = matcher.next_sequence(|seq| {
        assert_eq!(seq, expected);
    });
    assert!(emitted);

    // The emitted offset must now lead the repeat-offset history.
    assert_eq!(matcher.offset_hist, [5, 1, 4]);
}
11988
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep1_before_hash_lookup() {
    let mut matcher = MatchGenerator::new(64);
    let suffixes = SuffixStore::with_capacity(64);
    matcher.add_data(b"abcdefghijabcdefghij".to_vec(), suffixes, |_, _| {});

    // Place the cursor on the second copy of the pattern, with no pending
    // literals, and put the matching offset (10) in the second history slot.
    matcher.suffix_idx = 10;
    matcher.last_idx_in_sequence = 10;
    matcher.offset_hist = [99, 10, 4];

    let remaining = &matcher.window.last().unwrap().data[10..];
    let candidate = matcher.repcode_candidate(remaining, 0);
    assert_eq!(candidate, Some((10, 10)));
}
12006
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_repcode_can_target_previous_window_entry() {
    let block = b"abcdefghij";
    let mut matcher = MatchGenerator::new(64);

    // Two identical window entries; the first one is skipped, not matched.
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(64), |_, _| {});

    matcher.offset_hist = [99, 10, 4];

    let current = &matcher.window.last().unwrap().data;
    let candidate = matcher.repcode_candidate(current, 0);
    assert_eq!(candidate, Some((10, 10)));
}
12028
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep2() {
    let mut matcher = MatchGenerator::new(64);
    let suffixes = SuffixStore::with_capacity(64);
    matcher.add_data(b"abcdefghijabcdefghij".to_vec(), suffixes, |_, _| {});

    matcher.suffix_idx = 10;
    matcher.last_idx_in_sequence = 10;
    // rep1=4 does not match at idx 10, rep2=10 does.
    matcher.offset_hist = [99, 4, 10];

    let remaining = &matcher.window.last().unwrap().data[10..];
    let candidate = matcher.repcode_candidate(remaining, 0);
    assert_eq!(candidate, Some((10, 10)));
}
12046
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep0_minus1() {
    let mut matcher = MatchGenerator::new(64);
    let suffixes = SuffixStore::with_capacity(64);
    matcher.add_data(b"abcdefghijabcdefghij".to_vec(), suffixes, |_, _| {});

    matcher.suffix_idx = 10;
    matcher.last_idx_in_sequence = 10;
    // rep1=4 and rep2=99 do not match; rep0-1 == 10 does.
    matcher.offset_hist = [11, 4, 99];

    let remaining = &matcher.window.last().unwrap().data[10..];
    let candidate = matcher.repcode_candidate(remaining, 0);
    assert_eq!(candidate, Some((10, 10)));
}
12064
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_repcode_rejects_offsets_beyond_searchable_prefix() {
    let mut matcher = MatchGenerator::new(64);

    // Ten bytes of skipped history, then ten fresh bytes.
    matcher.add_data(b"abcdefghij".to_vec(), SuffixStore::with_capacity(64), |_, _| {});
    matcher.skip_matching();
    matcher.add_data(b"klmnopqrst".to_vec(), SuffixStore::with_capacity(64), |_, _| {});
    matcher.suffix_idx = 3;

    // Offset 14 from index 3 of the second entry reaches past the 13 bytes
    // that precede the cursor, so no match length may be reported.
    let remaining = &matcher.window.last().unwrap().data[3..];
    let candidate = matcher.offset_match_len(14, remaining);
    assert_eq!(candidate, None);
}
12085
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_skip_matching_seeds_every_position_even_with_fast_step() {
    let mut matcher = MatchGenerator::new(64);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;

    matcher.add_data(
        b"abcdefghijklmnop".to_vec(),
        SuffixStore::with_capacity(64),
        |_, _| {},
    );
    matcher.skip_matching();
    matcher.add_data(b"bcdef".to_vec(), SuffixStore::with_capacity(64), |_, _| {});

    // "bcdef" starts at index 1 of the skipped 16-byte block, i.e. 15 bytes
    // behind the start of the new block.
    let expected = Sequence::Triple {
        literals: b"",
        offset: 15,
        match_len: 5,
    };
    let emitted = matcher.next_sequence(|seq| {
        assert_eq!(seq, expected);
    });
    assert!(emitted);

    // The single match consumes the whole block; nothing else is emitted.
    let emitted_again = matcher.next_sequence(|_| {});
    assert!(!emitted_again);
}
12111
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    let block: &[u8] = b"abcdefghijklmnopqrstuvwxyz012345";
    // Two MIN_MATCH_LEN-sized probes: one cut from index 3 of the block, one
    // covering its final bytes.
    let sparse_probe = block[3..3 + MIN_MATCH_LEN].to_vec();
    let tail_probe = block[block.len() - MIN_MATCH_LEN..].to_vec();

    let mut matcher = MatchGenerator::new(128);
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(256), |_, _| {});
    matcher.skip_matching_with_hint(Some(true));

    // Observable behavior check: sparse-prefix probe should not immediately match.
    matcher.add_data(sparse_probe, SuffixStore::with_capacity(256), |_, _| {});
    let mut sparse_first_is_literals = None;
    let emitted = matcher.next_sequence(|seq| {
        // Only the first emitted sequence is recorded.
        let _ = sparse_first_is_literals.get_or_insert(matches!(seq, Sequence::Literals { .. }));
    });
    assert!(emitted);
    assert!(
        sparse_first_is_literals.unwrap_or(false),
        "sparse-start probe should not produce an immediate match"
    );

    // Dense tail remains indexed for cross-block boundary matching.
    let mut matcher = MatchGenerator::new(128);
    matcher.add_data(block.to_vec(), SuffixStore::with_capacity(256), |_, _| {});
    matcher.skip_matching_with_hint(Some(true));
    matcher.add_data(tail_probe, SuffixStore::with_capacity(256), |_, _| {});
    let mut tail_first_is_immediate_match = None;
    let emitted = matcher.next_sequence(|seq| {
        let _ = tail_first_is_immediate_match.get_or_insert(matches!(
            seq,
            Sequence::Triple { literals, .. } if literals.is_empty()
        ));
    });
    assert!(emitted);
    assert!(
        tail_first_is_immediate_match.unwrap_or(false),
        "dense tail probe should match immediately at block start"
    );
}
12158
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_backfills_last_searchable_anchor() {
    let mut matcher = MatchGenerator::new(64);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;
    let suffixes = SuffixStore::with_capacity(64);
    matcher.add_data(b"01234abcde".to_vec(), suffixes, |_, _| {});

    matcher.add_suffixes_till(10, FAST_HASH_FILL_STEP);

    // The key made of bytes 5..10 must be registered at index 5.
    let newest = matcher.window.last().unwrap();
    let anchor_key = &newest.data[5..10];
    assert_eq!(newest.suffixes.get(anchor_key), Some(5));
}
12175
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_skips_when_idx_below_min_match_len() {
    let mut matcher = MatchGenerator::new(128);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;
    let suffixes = SuffixStore::with_capacity(1 << 16);
    matcher.add_data(b"abcdefghijklmnopqrstuvwxyz".to_vec(), suffixes, |_, _| {});

    // Stopping one byte short of MIN_MATCH_LEN must register nothing, not
    // even the key at index 0.
    matcher.add_suffixes_till(MIN_MATCH_LEN - 1, FAST_HASH_FILL_STEP);

    let newest = matcher.window.last().unwrap();
    let leading_key = &newest.data[..MIN_MATCH_LEN];
    assert_eq!(newest.suffixes.get(leading_key), None);
}
12193
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_fast_step_registers_interleaved_positions() {
    let mut matcher = MatchGenerator::new(128);
    matcher.hash_fill_step = FAST_HASH_FILL_STEP;
    let suffixes = SuffixStore::with_capacity(1 << 16);
    matcher.add_data(b"abcdefghijklmnopqrstuvwxyz".to_vec(), suffixes, |_, _| {});

    matcher.add_suffixes_till(17, FAST_HASH_FILL_STEP);

    // Positions 0, 3, 6, 9 and 12 must each be registered under their own key.
    let newest = matcher.window.last().unwrap();
    for pos in (0usize..=12).step_by(3) {
        let key = &newest.data[pos..pos + MIN_MATCH_LEN];
        let registered = newest.suffixes.get(key);
        assert_eq!(
            registered,
            Some(pos),
            "expected interleaved suffix registration at pos {pos}"
        );
    }
}
12217
/// Commits three 6-byte blocks (18 bytes in total) to a matcher created with
/// a 16-byte window, skipping the first two, then replays the sequences
/// emitted for the third block and checks the reconstructed bytes.
#[test]
fn dfast_skip_matching_handles_window_eviction() {
    let repeated: Vec<u8> = alloc::vec![7, 8, 9, 10, 11, 12];
    let mut matcher = DfastMatchGenerator::new(16);

    matcher.add_data(alloc::vec![1, 2, 3, 4, 5, 6], |_| {});
    matcher.skip_matching(None);
    matcher.add_data(repeated.clone(), |_| {});
    matcher.skip_matching(None);
    matcher.add_data(repeated.clone(), |_| {});

    // Decoder stand-in, pre-seeded with the second block so back-references
    // into it can be resolved.
    let mut replay = repeated;
    matcher.start_matching(|seq| {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        replay.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_reference {
            let start = replay.len() - offset;
            // Copy one byte at a time so a source range that runs into the
            // freshly appended bytes is still valid.
            for src in start..start + match_len {
                let byte = replay[src];
                replay.push(byte);
            }
        }
    });

    assert_eq!(replay, [7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12]);
}
12247
/// The buffer handed to the `add_data` callback must report a length of 8
/// even though every input buffer is allocated with capacity 64.
#[test]
fn dfast_add_data_callback_reports_evicted_len_not_capacity() {
    /// Builds a buffer whose capacity (64) deliberately exceeds its length.
    fn oversized(bytes: &[u8]) -> Vec<u8> {
        let mut buf = Vec::with_capacity(64);
        buf.extend_from_slice(bytes);
        buf
    }

    let mut matcher = DfastMatchGenerator::new(8);
    matcher.add_data(oversized(b"abcdefgh"), |_| {});

    let mut observed_evicted_len = None;
    matcher.add_data(oversized(b"ijklmnop"), |data| {
        observed_evicted_len = Some(data.len());
    });

    assert_eq!(
        observed_evicted_len,
        Some(8),
        "eviction callback must report evicted byte length, not backing capacity"
    );
}
12270
/// Driver-path regression for the `commit_space` Dfast-branch eviction
/// accounting bug (CodeRabbit Critical on PR #146).
///
/// Old code counted the INPUT buffer length as `evicted_bytes` because
/// Dfast's `add_data` callback receives the input `Vec<u8>` for pool
/// recycling (Dfast stores bytes in `history`, not per-block Vecs). On the
/// saturated-window 1:1 path the input length and the evicted length
/// coincide, so a fixture on that path passes by accident. The fix derives
/// eviction from the `window_size` delta plus the input length:
/// `evicted = pre + space_len - post`.
///
/// The test exercises `MatchGeneratorDriver::commit_space` directly (not
/// just `DfastMatchGenerator::add_data`) so the assertion catches a future
/// regression that swaps the Dfast branch in `commit_space` back to
/// `evicted_bytes += data.len()`; hand-recomputing the formula on the
/// matcher would pass either way.
///
/// Fixture: `max_window_size = 10`, commit sequence `[4, 4, 5]`, which
/// forces the divergent case where the popped block (4 bytes) and the new
/// input (5 bytes) have different sizes:
///
///   * after commit `"abcd"` (4 B): window_blocks=[4], ws=4
///   * after commit `"efgh"` (4 B): window_blocks=[4,4], ws=8
///   * commit `"ijklm"` (5 B): 8+5>10 → pop front [4] (evict=4),
///     push 5 → window_blocks=[4,5], ws=9
///
/// `commit_space` then calls `retire_dictionary_budget(evicted)`, and the
/// downstream `trim_after_budget_retire` cascade (which fires whenever
/// `retire_dictionary_budget` returns true) drives the budget further down
/// by trimming the now-oversize window.
///
/// Tracing the fix path end-to-end with starting budget = 100:
///   1st commit: evicted=0, no retire.
///   2nd commit: evicted=0, no retire.
///   3rd commit: evicted=4. retire(4) → budget=96, max_window=6.
///     trim_after_budget_retire:
///       iter1: ws=9 > max=6, pop [4] → ws=5, evicted=4.
///              retire(4) → budget=92, max_window=2.
///       iter2: ws=5 > max=2, pop [5] → ws=0, evicted=5.
///              retire(5) → budget=87, max_window=0.
///       iter3: ws=0, no trim, retire(0) → false, exit.
///   Final budget = 87. Final max_window_size = 0.
///
/// With the buggy accounting (`evicted` taken from `data.len()`) the
/// retire/trim cascade starts from a different state; the message on the
/// final budget assertion records the expected symptom (a panic on
/// `data.len() <= max_window_size` during the 3rd commit). Either way the
/// regression surfaces as a test failure.
#[test]
fn dfast_commit_space_eviction_uses_window_size_delta() {
    use crate::encoding::CompressionLevel;

    /// Builds a commit buffer with spare capacity (64) beyond its length.
    fn space_with(bytes: &[u8]) -> Vec<u8> {
        let mut space = Vec::with_capacity(64);
        space.extend_from_slice(bytes);
        space
    }

    let mut driver = MatchGeneratorDriver::new(10, 1);
    driver.reset(CompressionLevel::Level(3));
    assert!(matches!(driver.storage, MatcherStorage::Dfast(_)));

    // Override the level-derived window with a tiny one so the
    // 4 + 4 + 5 = 13 commit sequence below actually crosses the
    // boundary. A 16 KiB+ default window would never evict on this
    // little data and the bug would stay invisible.
    driver.dfast_matcher_mut().max_window_size = 10;
    driver.dictionary_retained_budget = 100;

    driver.commit_space(space_with(b"abcd"));
    assert_eq!(
        driver.dictionary_retained_budget, 100,
        "1st commit fills window 0 → 4, no eviction, no retire"
    );

    driver.commit_space(space_with(b"efgh"));
    assert_eq!(
        driver.dictionary_retained_budget, 100,
        "2nd commit fills window 4 → 8, no eviction, no retire"
    );

    driver.commit_space(space_with(b"ijklm"));
    assert_eq!(
        driver.dictionary_retained_budget, 87,
        "3rd commit + trim_after_budget_retire cascade. With the fix \
         (evicted=4 from window_size delta) the cascade reclaims 100 \
         → 96 → 92 → 87. With the bug (evicted=5 from data.len()) the \
         3rd commit would panic on `data.len() <= max_window_size` \
         after the 2nd commit's cascade had already shrunk \
         max_window_size to 0."
    );
    let final_window = driver.dfast_matcher_mut().max_window_size;
    assert_eq!(
        final_window,
        0,
        "cascade drains max_window_size to 0 once budget reclaim \
         exceeds the initial window size"
    );
}
12384
/// Shrinking `max_window_size` from 16 to 8 and calling `trim_to_window`
/// must drop exactly the oldest 8-byte block.
#[test]
fn dfast_trim_to_window_evicts_oldest_block_by_length() {
    /// Builds an input buffer with spare capacity (64) beyond its length.
    fn oversized(bytes: &[u8]) -> Vec<u8> {
        let mut buf = Vec::with_capacity(64);
        buf.extend_from_slice(bytes);
        buf
    }

    // After the history-only storage refactor (#111 Phase 7c step 3),
    // Dfast no longer retains input `Vec<u8>`s — the `history`
    // contiguous buffer is the sole byte store, and `add_data`
    // returns the input Vec to the caller's pool eagerly. So
    // `trim_to_window` doesn't have anything to hand back to the
    // closure (no Vec exists to give). The eviction is observable
    // instead through `window_size` shrinking by the per-block
    // length recorded in `window_blocks`.
    let mut matcher = DfastMatchGenerator::new(16);
    matcher.add_data(oversized(b"abcdefgh"), |_| {});
    matcher.add_data(oversized(b"ijklmnop"), |_| {});

    // Precondition: both blocks are resident before the trim.
    assert_eq!(matcher.window_size, 16);
    assert_eq!(matcher.window_blocks.len(), 2);

    matcher.max_window_size = 8;
    matcher.trim_to_window();

    // No callback signature to assert on: the Dfast variant of
    // `trim_to_window` takes none. That signature shape (vs HC/Row
    // which accept `impl FnMut(Vec<u8>)`) is the property locking in
    // the contract — there is no closure to invoke or skip, so no
    // future change can "start invoking the callback" without a
    // compile-time signature break that the dispatcher and this test
    // would force the author to address.
    assert_eq!(
        matcher.window_size, 8,
        "exactly one 8-byte block must remain"
    );
    assert_eq!(matcher.window_blocks.len(), 1);
    assert_eq!(matcher.history_abs_start, 8);
}
12426
/// The second block must be emitted as one literal-free match of length 11
/// at offset 5, i.e. anchored in the final bytes of the first block.
#[test]
fn dfast_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let mut history = Vec::new();

    // Block 1 contains no repeats, so it must come out as literals only.
    matcher.add_data(b"012345bcdea".to_vec(), |_| {});
    matcher.start_matching(|seq| match seq {
        Sequence::Triple { .. } => unreachable!("first block should not match history"),
        Sequence::Literals { literals } => history.extend_from_slice(literals),
    });
    assert_eq!(history, b"012345bcdea");

    // Block 2 repeats the "bcdea" tail of block 1 with period 5.
    matcher.add_data(b"bcdeabcdeab".to_vec(), |_| {});
    let mut sequences_seen = 0usize;
    matcher.start_matching(|seq| {
        assert!(sequences_seen == 0, "expected a single cross-block match");
        sequences_seen += 1;
        match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert_eq!(literals, b"");
                assert_eq!(offset, 5);
                assert_eq!(match_len, 11);
                // Replay the copy byte by byte: the source range overlaps
                // the bytes being appended (offset < match_len).
                let start = history.len() - offset;
                for src in start..start + match_len {
                    let byte = history[src];
                    history.push(byte);
                }
            }
            Sequence::Literals { .. } => {
                panic!("expected tail-anchored cross-block match before any literals")
            }
        }
    });

    assert!(
        sequences_seen > 0,
        "expected tail-anchored cross-block match"
    );
    assert_eq!(history, b"012345bcdeabcdeabcdeab");
}
12471
/// Regression for #49 — locks down `MatchTable::backfill_boundary_positions`
/// for the [`HcMatchGenerator`] lazy path.
///
/// `backfill_boundary_positions` seeds ONLY the last `< 4` bytes of the
/// previous slice (positions in `[current_abs_start - 3, current_abs_start)`)
/// — the bytes that `insert_position` could not hash at the time because
/// hashing needs 4 bytes of lookahead. The existing 8 MiB window roundtrip
/// test exercises cross-slice behaviour end-to-end, but does not isolate the
/// backfill of those final 1-3 unhashable bytes.
///
/// The fixture forces the cross-block match's candidate position to lie in
/// `[block_1_end - 3, block_1_end)`:
///
/// - Block 1 = `b"PQRSTBCD"` (8 bytes). Block 1's `start_matching` hashes
///   positions 0..=4 (each has 4 bytes of forward context); positions 5/6/7
///   are the unhashable tail.
/// - Block 2 = `b"BCDBCDBCDB"` (10 bytes). At absolute position 8 (block 2
///   start) the 4-byte window is `b"BCDB"`. The ONLY place `b"BCDB"` was
///   inserted in the hash + chain tables is position 5 — via
///   `backfill_boundary_positions` on the next-slice entry (the 4-byte
///   window at position 5 is `data[5..9] = b"BCD" + block_2[0] = b"BCDB"`).
///
/// If `backfill_boundary_positions` regresses, position 5 is never hashed,
/// position 8's lookup misses, and the lazy parser falls through to a
/// leading literals run — `offset == 3, match_len >= 4` would no longer hold.
#[test]
fn hashchain_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    matcher.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    // Block 1: no internal repeats, so only literals may be emitted.
    matcher.table.add_data(b"PQRSTBCD".to_vec(), |_| {});
    let mut history = alloc::vec::Vec::new();
    matcher.start_matching(|seq| match seq {
        Sequence::Triple { .. } => unreachable!("first block has no internal repeats"),
        Sequence::Literals { literals } => history.extend_from_slice(literals),
    });
    assert_eq!(history, b"PQRSTBCD");

    // Block 2: record `(offset, match_len)` of the first sequence only.
    matcher.table.add_data(b"BCDBCDBCDB".to_vec(), |_| {});
    let mut first_match: Option<(usize, usize)> = None;
    matcher.start_matching(|seq| {
        if first_match.is_some() {
            return;
        }
        match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert_eq!(literals, b"", "no leading literals on the boundary match");
                first_match = Some((offset, match_len));
            }
            Sequence::Literals { .. } => {
                panic!(
                    "expected tail-anchored cross-block match before any literals — \
                     backfill_boundary_positions did not seed positions 5/6/7"
                )
            }
        }
    });

    let (offset, match_len) = first_match.expect(
        "expected tail-anchored cross-block match emitted from backfill_boundary_positions",
    );
    assert!(
        (1..=3).contains(&offset),
        "boundary match offset {offset} must point into the unhashable tail \
         (positions 5/6/7 of an 8-byte block 1) so the test specifically \
         locks down backfill_boundary_positions",
    );
    assert_eq!(
        offset, 3,
        "candidate position must land at 5 (= block_1_len - 3) so the 4-byte \
         window `data[5..9] = b\"BCDB\"` matches block 2's first hash lookup",
    );
    assert!(
        match_len >= HC_MIN_MATCH_LEN,
        "match_len {match_len} must clear the HC min-match floor",
    );
}
12557
/// After a `skip_matching(Some(false))` pass over the first block, a second
/// block that begins with the first block's tail must open with a
/// literal-free match at offset `tail.len()`.
#[test]
fn dfast_dense_skip_matching_backfills_previous_tail_for_next_block() {
    let tail = b"Qz9kLm2Rp";
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    let mut first = b"0123456789abcdef".to_vec();
    first.extend_from_slice(tail);
    matcher.add_data(first.clone(), |_| {});
    matcher.skip_matching(Some(false));

    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // Capture `(literal_len, offset, match_len)` of the first sequence only;
    // a literals-only sequence is recorded with zero offset and length.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected dense skip to preserve cross-boundary tail match"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12601
/// After a `skip_matching(Some(true))` pass over a 4096-byte block that ends
/// in `tail`, a second block that begins with `tail` must open with a
/// literal-free match at offset `tail.len()`.
#[test]
fn dfast_sparse_skip_matching_preserves_tail_cross_block_match() {
    let tail = b"Qz9kLm2Rp";
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // Generated bytes with the known tail stamped over the final bytes.
    let mut first = deterministic_high_entropy_bytes(0x9E37_79B9_7F4A_7C15, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    matcher.add_data(first.clone(), |_| {});

    matcher.skip_matching(Some(true));

    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // Capture `(literal_len, offset, match_len)` of the first sequence only;
    // a literals-only sequence is recorded with zero offset and length.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected match against densely seeded tail"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12647
/// After a one-byte block is appended and `skip_matching_dense` runs again,
/// the long-hash entry for position `first_len - 7` must hold that
/// position's packed slot.
#[test]
fn dfast_skip_matching_dense_backfills_newly_hashable_long_tail_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let first = deterministic_high_entropy_bytes(0x7A64_0315_D4E1_91C3, 4096);
    let first_len = first.len();
    matcher.add_data(first, |_| {});
    matcher.skip_matching_dense();

    // Appending one byte makes exactly the previous block's last 7 starts
    // newly eligible for 8-byte long-hash insertion.
    matcher.add_data(alloc::vec![0xAB], |_| {});
    matcher.skip_matching_dense();

    let boundary_abs = first_len - 7;
    let boundary_rel = boundary_abs - matcher.history_abs_start;
    let live = matcher.live_history();
    assert!(
        boundary_rel + 8 <= live.len(),
        "fixture must make the boundary start long-hashable"
    );
    let bucket = matcher.long_hash_index(&live[boundary_rel..]);
    let expected_slot = matcher.pack_slot(boundary_abs);
    // Single-slot tables (upstream zstd parity): the bucket holds at most one
    // u32; the assertion below is a direct equality (no `.contains`).
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );
    assert_eq!(
        matcher.tables[bucket], expected_slot,
        "dense skip must seed long-hash entry for newly hashable boundary start"
    );
}
12681
/// Seeds the tail of a 64-byte block starting from
/// `block_len - DFAST_MIN_MATCH_LEN` and checks that the short-hash entry for
/// the start 5 bytes before the block end holds that position's packed slot.
#[test]
fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 20);
    matcher.add_data(
        deterministic_high_entropy_bytes(0x13F0_9A6D_55CE_7B21, 64),
        |_| {},
    );
    matcher.ensure_hash_tables();

    // Geometry of the most recently added block inside the window.
    let block_len = matcher.window_blocks.back().map_or(0, |&len| len);
    let block_abs_start = matcher.history_abs_start + matcher.window_size - block_len;
    matcher.seed_remaining_hashable_starts(
        block_abs_start,
        block_len,
        block_len - DFAST_MIN_MATCH_LEN,
    );

    // Last start that still has 5 bytes available before the block end.
    let last_short_abs = block_abs_start + block_len - 5;
    let last_short_rel = last_short_abs - matcher.history_abs_start;
    let history = matcher.live_history();
    assert!(
        last_short_rel + 5 <= history.len(),
        "fixture must leave the last short-hash start valid"
    );
    let short_bucket = matcher.short_hash_index(&history[last_short_rel..]);
    let expected_slot = matcher.pack_slot(last_short_abs);
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );

    // Short-hash entries are indexed after the `long_len()` long-hash entries.
    let short_table_index = matcher.long_len() + short_bucket;
    assert_eq!(
        matcher.tables[short_table_index],
        expected_slot,
        "tail seeding must include the last 5-byte-hashable start"
    );
}
12713
/// Same fixture shape as the regular tail-seeding case, but the seeding call
/// is handed `pos == block_len` (the block end). The short-hash entry for the
/// start 5 bytes before the block end must still be populated.
#[test]
fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() {
    let mut matcher = DfastMatchGenerator::new(1 << 20);
    let payload = deterministic_high_entropy_bytes(0x7BB2_DA91_441E_C0EF, 64);
    matcher.add_data(payload, |_| {});
    matcher.ensure_hash_tables();

    // Length and absolute start of the newest block in the window.
    let block_len = matcher.window_blocks.back().copied().unwrap_or_default();
    let block_abs_start = matcher.history_abs_start + matcher.window_size - block_len;

    // `pos` sits exactly at the end of the block.
    matcher.seed_remaining_hashable_starts(block_abs_start, block_len, block_len);

    let tail_abs = block_abs_start + block_len - 5;
    let tail_rel = tail_abs - matcher.history_abs_start;
    let history = matcher.live_history();
    assert!(
        tail_rel + 5 <= history.len(),
        "fixture must leave the last short-hash start valid"
    );
    let short_bucket = matcher.short_hash_index(&history[tail_rel..]);
    let expected_slot = matcher.pack_slot(tail_abs);
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );

    // Short-hash entries are indexed after the `long_len()` long-hash entries.
    let short_table_index = matcher.long_len() + short_bucket;
    assert_eq!(
        matcher.tables[short_table_index],
        expected_slot,
        "tail seeding must still include the last 5-byte-hashable start when pos is at block end"
    );
}
12744
/// Rebase contract for `ensure_room_for`: asking for an absolute position
/// whose relative offset would exceed `u32::MAX - DFAST_REBASE_GUARD_BAND`
/// must trigger `reduce()`. Once rebased:
///
/// * an entry seeded at a much smaller absolute position falls below the
///   reducer and is cleared to `DFAST_EMPTY_SLOT`;
/// * a fresh `pack_slot` at the boundary position yields a non-sentinel
///   value that unpacks back to the same absolute position.
///
/// Mirrors `LdmHashTable::ensure_room_for_*` from PR #139.
///
/// The test is not gated by pointer width: `trigger_abs = u32::MAX -
/// DFAST_REBASE_GUARD_BAND + 1 = 0xC0000000` fits in `usize` on i686
/// (`usize::MAX = u32::MAX`) without overflow, so the packed-slot boundary
/// path and the u32 ↔ usize round-trip run on every pointer width we ship.
#[test]
fn dfast_ensure_room_for_rebases_above_guard_band() {
    let guard_band = DFAST_REBASE_GUARD_BAND as usize;

    let mut dfast = DfastMatchGenerator::new(1 << 22);
    dfast.set_hash_bits(10, 10);
    dfast.ensure_hash_tables();

    // Plant the same early position in the first short-hash bucket and the
    // first long-hash bucket. `ensure_room_for` / `reduce` is a shared
    // contract for both tables, so a regression that cleared only one of
    // them must not slip through. Writing the packed slot straight into the
    // buckets keeps the test on the rebase mechanics and sidesteps the
    // history/window setup that `insert_position` would need.
    let early_packed = dfast.pack_slot(1024usize);
    assert_ne!(early_packed, DFAST_EMPTY_SLOT);
    let first_short_bucket = dfast.long_len();
    for bucket in [first_short_bucket, 0] {
        dfast.tables[bucket] = early_packed;
    }

    // Smallest absolute position that fails the `rel <= max_rel` test while
    // `position_base == 0`. A single `reduce(DFAST_REBASE_GUARD_BAND)` then
    // moves the base forward by exactly the guard band.
    let trigger_abs = (u32::MAX as usize) - guard_band + 1;
    assert_eq!(dfast.position_base, 0);
    dfast.ensure_room_for(trigger_abs);
    assert_eq!(
        dfast.position_base, guard_band,
        "rebase must advance position_base by DFAST_REBASE_GUARD_BAND"
    );

    // abs=1024 was stored as packed slot 1025. The rebase subtracts the
    // guard band (= 2^30) from every slot, and 1025 <= 2^30, so the entry
    // collapses to the empty sentinel — upstream zstd parity for
    // `ZSTD_window_reduce`'s clamp-at-zero rule. Both tables are checked
    // because `reduce()` walks them in sequence.
    assert_eq!(
        dfast.tables[dfast.long_len()],
        DFAST_EMPTY_SLOT,
        "pre-rebase short-hash entries below the reducer must become empty"
    );
    assert_eq!(
        dfast.tables[0], DFAST_EMPTY_SLOT,
        "pre-rebase long-hash entries below the reducer must become empty"
    );

    // Packing the trigger position after the rebase must give a real slot
    // that maps back through `position_base + slot - 1`.
    let post_packed = dfast.pack_slot(trigger_abs);
    assert_ne!(post_packed, DFAST_EMPTY_SLOT);
    let unpacked = dfast.position_base + (post_packed as usize) - 1;
    assert_eq!(
        unpacked, trigger_abs,
        "post-rebase pack/unpack must round-trip the absolute position"
    );
}
12819
/// Two consecutive blocks are added and skipped with
/// `skip_matching(Some(true))`. The first block ends with `boundary_prefix`
/// and the second starts with `boundary_suffix`, so the concatenation
/// `boundary_prefix ++ boundary_suffix` straddles the first→second block
/// boundary. A third block that begins with that same byte string must
/// produce an immediate match (no leading literals) whose offset points back
/// to the straddling occurrence.
#[test]
fn dfast_sparse_skip_matching_backfills_previous_tail_for_consecutive_sparse_blocks() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let boundary_prefix = [0xFA, 0xFB, 0xFC];
    let boundary_suffix = [0xFD, 0xEE, 0xAD, 0xBE, 0xEF, 0x11, 0x22, 0x33];

    // First block: high-entropy bytes whose final bytes are the prefix.
    let mut first = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    let first_tail_start = first.len() - boundary_prefix.len();
    first[first_tail_start..].copy_from_slice(&boundary_prefix);
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Second block: same generator seed, leading bytes replaced by the suffix.
    // Only its length is needed after the hand-off, so record it up front and
    // move the buffer into the matcher instead of cloning it.
    let mut second = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    second[..boundary_suffix.len()].copy_from_slice(&boundary_suffix);
    let second_len = second.len();
    matcher.add_data(second, |_| {});
    matcher.skip_matching(Some(true));

    // Third block: the straddling byte string followed by filler literals.
    let mut third = boundary_prefix.to_vec();
    third.extend_from_slice(&boundary_suffix);
    third.extend_from_slice(b"-trailing-literals");
    matcher.add_data(third, |_| {});

    // Capture only the first emitted sequence as (lit_len, offset, match_len);
    // a literals-only sequence is recorded with offset and match_len of 0.
    let mut first_sequence: Option<(usize, usize, usize)> = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate match from the prior sparse-skip boundary"
    );
    // The match source starts `boundary_prefix.len()` bytes before the end of
    // the first block, i.e. one full second block plus the prefix behind us.
    assert_eq!(
        offset,
        second_len + boundary_prefix.len(),
        "expected match against backfilled first→second boundary start"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12872
/// Pushes a deterministic 2047-byte pseudo-random buffer (seed 23), followed
/// by two copies of its bytes `128..256`, through a `Fastest` driver that was
/// given a source-size hint, then replays the emitted sequences and checks
/// that they rebuild the input byte-for-byte.
#[test]
fn fastest_hint_iteration_23_sequences_reconstruct_source() {
    /// Emits `len` bytes from a 64-bit linear congruential generator, taking
    /// bits 33.. of each successive state as the output byte.
    fn generate_data(seed: u64, len: usize) -> Vec<u8> {
        let mut state = seed;
        (0..len)
            .map(|_| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (state >> 33) as u8
            })
            .collect()
    }

    let iteration = 23u64;
    let base_len = (iteration * 89 % 16384) as usize;
    let mut data = generate_data(iteration, base_len);
    // Tack the same 128-byte slice on twice so the input is guaranteed to
    // contain repeated content a matcher could reference.
    let repeat = data[128..256].to_vec();
    for _ in 0..2 {
        data.extend_from_slice(&repeat);
    }

    let mut driver = MatchGeneratorDriver::new(1024 * 128, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Fastest);
    let mut space = driver.get_next_space();
    space[..data.len()].copy_from_slice(&data);
    space.truncate(data.len());
    driver.commit_space(space);

    let mut rebuilt = Vec::with_capacity(data.len());
    let mut saw_triple = false;
    driver.start_matching(|seq| {
        // Split every sequence into its literal run plus an optional
        // (offset, match_len) back-reference.
        let (literals, back_ref) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        rebuilt.extend_from_slice(literals);

        if let Some((offset, match_len)) = back_ref {
            saw_triple = true;
            assert!(offset > 0, "offset must be non-zero");
            assert!(
                offset <= rebuilt.len(),
                "offset must reference already-produced bytes: offset={} produced={}",
                offset,
                rebuilt.len()
            );
            // Byte-at-a-time copy: the source range may overlap the bytes
            // being appended when `match_len > offset`.
            let start = rebuilt.len() - offset;
            for src in start..start + match_len {
                let byte = rebuilt[src];
                rebuilt.push(byte);
            }
        }
    });

    // A `Triple` is not required here. The upstream zstd-shape kernel walks
    // ip0 with kSearchStrength-driven stride growth and may step over the
    // repeat region once the stride is large, whereas the legacy
    // SuffixStore-based matcher visited every position and always hit short
    // repeats. `saw_triple` is therefore only a legacy tuning preference;
    // the reconstruction check below is the correctness invariant.
    let _ = saw_triple;
    assert_eq!(rebuilt, data);
}
12942
/// Pins the Fast-strategy parameters that `resolve_level_params` reports for
/// level 1, the negative levels -7..=-1, `Fastest`, and `Uncompressed`.
#[test]
fn fast_levels_dispatch_per_level_hash_log_and_mls() {
    // (window_log, hash_log, mls, step_size) as resolved for a level.
    let resolved_quad = |level: CompressionLevel| {
        let params = resolve_level_params(level, None);
        let fast = params.fast.unwrap();
        (params.window_log, fast.hash_log, fast.mls, fast.step_size)
    };

    // Level 1 follows the upstream zstd `{ 19, 13, 14, 1, 7, 0, ZSTD_fast }`
    // row: window_log=19, hash_log=14, mls=7.
    let level_one = resolve_level_params(CompressionLevel::Level(1), None)
        .fast
        .unwrap();
    assert_eq!(level_one.hash_log, 14);
    assert_eq!(level_one.mls, 7);
    assert_eq!(level_one.step_size, 2);

    // Negative levels use hash_log=13 and mls=7 (the "base for negative"
    // row). A hash_log=13 table is 32 KiB and stays L1d-resident (every
    // probe an L1 hit, vs an L2 access for a 64 KiB hash_log=14 table), and
    // minMatch=7 drops short-distance 6-byte matches. step_size follows
    // upstream zstd's formula: targetLength = -level and
    // step_size = (-level) + 1, i.e. 2..8 for L-1..L-7.
    for n in -7..=-1 {
        let negative = resolve_level_params(CompressionLevel::Level(n), None)
            .fast
            .unwrap();
        let expected_step = ((-n) as usize) + 1;
        assert_eq!(negative.hash_log, 13, "Level({n}) fast_hash_log");
        assert_eq!(negative.mls, 7, "Level({n}) fast_mls");
        assert_eq!(negative.step_size, expected_step, "Level({n}) fast_step_size");
    }

    // Fastest keeps hash_log=14 / mls=6 (its own tuning; not part of the
    // negative-level ladder) on a window_log of 19.
    assert_eq!(resolved_quad(CompressionLevel::Fastest), (19, 14, 6, 2));

    // Uncompressed keeps window_log=17 (no history references, smaller
    // decoder reservation); its fast parameters are the same
    // hash_log=14 / mls=6 / step_size=2 tuple asserted for Fastest.
    assert_eq!(resolved_quad(CompressionLevel::Uncompressed), (17, 14, 6, 2));
}
12988
/// End-to-end wiring check: for each Fast level, reset a
/// `MatchGeneratorDriver` and confirm the inner `FastKernelMatcher` reports
/// the same `(hash_log, mls, step_size)` that `resolve_level_params`
/// produced. This catches plumbing mistakes a parameter-only test cannot
/// see — swapped arguments, a step_size left over from a previous frame, or
/// values stuck on their defaults.
#[test]
fn fast_levels_driver_wiring_threads_cparams_into_inner_matcher() {
    let mut driver = MatchGeneratorDriver::new(64 * 1024, 1);

    // Level 1, Fastest, Uncompressed, then Level(-1) down to Level(-7).
    let named_levels = [
        CompressionLevel::Level(1),
        CompressionLevel::Fastest,
        CompressionLevel::Uncompressed,
    ];
    let fast_levels = named_levels
        .iter()
        .copied()
        .chain((1..=7).map(|magnitude| CompressionLevel::Level(-magnitude)));

    for level in fast_levels {
        let resolved = resolve_level_params(level, None);
        // Guard the premise: a level that does not resolve to the Fast
        // strategy would make the assertions below meaningless.
        assert_eq!(
            resolved.strategy_tag,
            super::strategy::StrategyTag::Fast,
            "{level:?} must resolve to Fast strategy",
        );

        // Detour through `Default`, which resolves to the Dfast strategy (a
        // non-Fast row), so the reset that follows has to take the
        // backend-switch path (`MatchGeneratorDriver::new` / `simple_mut`
        // recreate the Fast variant via `FastKernelMatcher::with_params`).
        // Skipping the detour would keep the driver in `BackendTag::Simple`
        // for the whole loop and only exercise `FastKernelMatcher::reset`,
        // leaving the `with_params` wiring untested on the production path.
        crate::encoding::Matcher::reset(&mut driver, CompressionLevel::Default);

        // Production reset path — the one FrameCompressor and
        // StreamingEncoder go through.
        crate::encoding::Matcher::reset(&mut driver, level);

        let expected = resolved.fast.unwrap();
        let inner = driver.simple_mut();
        assert_eq!(
            inner.hash_log(),
            expected.hash_log,
            "{level:?}: inner matcher hash_log mismatch — argument swap?",
        );
        assert_eq!(
            inner.mls(),
            expected.mls,
            "{level:?}: inner matcher mls mismatch — argument swap?",
        );
        assert_eq!(
            inner.step_size(),
            expected.step_size,
            "{level:?}: inner matcher step_size mismatch — stale value carried from prior reset?",
        );
    }
}
13058
/// Locks `target_len` for levels 5-15 to the reference
/// `cParams.targetLength` column of `clevels.h` table[0] (the default table,
/// `srcSize > 256 KB`). In the reference's lazy outer loop `targetLength`
/// acts as `sufficient_len`: the "nice match" threshold at which the chain
/// walk stops as soon as a candidate reaches that length.
///
/// The reference runs levels 13-15 on btlazy2 while this crate runs them on
/// the hash-chain Lazy parser; the reference `targetLength` (32) is the same
/// nice-match threshold for both finders, so it is mirrored unchanged.
///
/// The expected values are transcribed inline from the table, keeping this a
/// pure-Rust in-tree test with no FFI dependency.
#[test]
fn lazy_band_target_len_matches_default_table() {
    // (level, targetLength) rows from table[0], levels 5..=15.
    let expected: [(i32, usize); 11] = [
        (5, 2),
        (6, 4),
        (7, 8),
        (8, 16),
        (9, 16),
        (10, 16),
        (11, 16),
        (12, 32),
        (13, 32),
        (14, 32),
        (15, 32),
    ];
    for (level, want) in expected.iter().copied() {
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        // The hash-chain config wins when present (L6-15, lazy); otherwise
        // fall back to the row config (L5, greedy on the Row backend).
        let target_len = match params.hc {
            Some(hc) => hc.target_len,
            None => params
                .row
                .map(|row| row.target_len)
                .expect("lazy/greedy level carries hc or row config"),
        };
        assert_eq!(target_len, want, "L{level}: target_len must match table[0]");
    }
}
13099
/// Levels 13-15 carry the reference btlazy2 budget from `clevels.h`
/// table[0]: `search_depth == 1 << cParams.searchLog` (16 / 32 / 64), and
/// `window_log` / `hash_log` / `chain_log` equal to the reference
/// `windowLog` / `hashLog` / `chainLog`. They run on the hash-chain Lazy
/// parser rather than a binary-tree finder, so they do not re-establish a
/// strict ratio ladder above L12 on window-fitting inputs. Checking the
/// whole row, not only `search_depth`, keeps the full budget aligned and
/// guards each field against silent drift.
#[test]
fn upper_lazy_band_params_match_default_table() {
    // table[0] (srcSize > 256 KB), levels 13..=15 (btlazy2 budget):
    // (level, windowLog, hashLog, chainLog, searchLog).
    let reference_rows: [(i32, u8, usize, usize, u32); 3] = [
        (13, 22, 22, 22, 4),
        (14, 22, 23, 22, 5),
        (15, 22, 23, 23, 6),
    ];
    for (level, wlog, hlog, clog, search_log) in reference_rows.iter().copied() {
        // The matcher stores the expanded depth, not the log.
        let sd = 1usize << search_log;
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        let hc = params.hc.unwrap();
        assert_eq!(hc.search_depth, sd, "L{level}: search_depth");
        assert_eq!(params.window_log, wlog, "L{level}: window_log");
        assert_eq!(hc.hash_log, hlog, "L{level}: hash_log");
        assert_eq!(hc.chain_log, clog, "L{level}: chain_log");
    }
}
structured_zstd/encoding/match_generator.rs

structured_zstd/encoding/
match_generator.rs