structured_zstd/encoding/
match_generator.rs

1//! Matching algorithm used to find repeated parts in the original data
2//!
3//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
4//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
5//!
6//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
7
8use alloc::vec::Vec;
9// SIMD/CRC intrinsics now live in `crate::encoding::fastpath::*` where they
10// sit under per-CPU `#[target_feature]` umbrellas; no architecture-specific
11// intrinsic imports remain in this file.
12use super::CompressionLevel;
13use super::Matcher;
14use super::Sequence;
15use super::blocks::encode_offset_with_history;
16use super::bt::BtMatcher;
17#[cfg(test)]
18use super::cost_model::HC_MAX_LIT;
19use super::cost_model::{
20    HC_BITCOST_MULTIPLIER, HC_FORMAT_MINMATCH, HC_OPT_NODE_LEN, HC_OPT_NUM, HC_OPT_PRICE_ARENA_LEN,
21    HC_OPT_PRICE_STRIDE, HC_PREDEF_THRESHOLD, HcOptState, HcOptimalCostProfile,
22};
23#[cfg(test)]
24use super::cost_model::{HC_BLOCKSIZE_MAX, HC_MAX_LL, HC_MAX_ML, HC_MAX_OFF, HcOptPriceType};
25use super::dfast::DfastMatchGenerator;
26// FAST_HASH_FILL_STEP test-only re-export was tied to the legacy
27// SuffixStore MatchGenerator's interleaved hash-fill stride. The
28// upstream zstd-shape Fast kernel walks ip0 with kSearchStrength step-skip
29// acceleration instead, so the constant has no consumer in the
30// remaining live test set today.
31#[cfg(test)]
32use super::match_table::helpers::INCOMPRESSIBLE_SKIP_STEP;
33use super::match_table::helpers::MIN_MATCH_LEN;
34#[cfg(test)]
35use super::match_table::helpers::common_prefix_len;
36#[cfg(test)]
37use super::opt::ldm::HcRawSeq;
38use super::opt::ldm::{HcOptLdmState, HcRawSeqStore};
39use super::opt::types::{
40    HcCandidateQuery, HcOptimalNode, HcOptimalPlanBuffers, HcOptimalPlanState, HcOptimalSequence,
41    MatchCandidate,
42};
43use super::row::RowMatchGenerator;
44use super::simple::fast_matcher::{FAST_LEVEL_1_HASH_LOG, FAST_LEVEL_1_MLS, FastKernelMatcher};
45#[cfg(all(
46    test,
47    feature = "std",
48    target_arch = "aarch64",
49    target_endian = "little"
50))]
51use std::arch::is_aarch64_feature_detected;
52#[cfg(all(test, feature = "std", target_arch = "x86_64"))]
53use std::arch::is_x86_feature_detected;
54
55pub(crate) const DFAST_MIN_MATCH_LEN: usize = 5;
56// Bytes the dfast short hash reads (upstream zstd `mls = 5`). Seeding / lookahead
57// guards use it so a position is only short-hashed once its full 5-byte key
58// is in range.
59pub(crate) const DFAST_SHORT_HASH_LOOKAHEAD: usize = 5;
60pub(crate) const ROW_MIN_MATCH_LEN: usize = 5;
61// Upstream zstd `clevels.h:31` at level 3 large-input bucket sets
62// `hashLog = 17` (the long-hash table) and `chainLog = 16` (the
63// short-hash table — upstream zstd names this `chainTable` even though for
64// dfast it's used as a plain single-slot hash). Each table holds one
65// `U32` per slot; the upstream zstd overwrites on collision and recovers
66// compression quality via the inline `_search_next_long` retry
67// (after a short-hash hit, probes `hashLong[hl1]` at `ip + 1` and
68// keeps the longer match).
69//
70// We mirror that storage layout: single `u32` per bucket (no
71// `[u32; N]` array), `long_hash` sized `1 << DFAST_HASH_BITS` and
72// `short_hash` one bit smaller via `DFAST_SHORT_HASH_BITS_DELTA`.
73// Two-table footprint at Level 3: `2^17 × 4 + 2^16 × 4 = 768 KiB`,
74// exact upstream parity. The `_search_next_long` retry lives in
75// `DfastMatchGenerator::hash_candidate` (called via
76// `best_match`). Earlier revisions kept a
77// 4-slot bucket per hash position; that paid 4× the upstream zstd memory
78// without measurable ratio gain once the retry was in place.
79//
80// `dfast_hash_bits_for_window` still clamps the runtime long-hash
81// value to `[MIN_WINDOW_LOG, DFAST_HASH_BITS]`, so this const is the
82// upper bound rather than a fixed default.
83pub(crate) const DFAST_HASH_BITS: usize = 17;
84/// Difference between `long_hash_bits` and `short_hash_bits` —
85/// upstream zstd `hashLog - chainLog` is 1 at every dfast level (`clevels.h`
86/// level 2: 16-15=1; level 3: 17-16=1). The short hash is one bit
87/// smaller than the long hash so the per-bucket footprint matches
88/// upstream zstd sizing exactly.
89pub(crate) const DFAST_SHORT_HASH_BITS_DELTA: usize = 1;
90/// Sentinel value for an empty slot in the dfast hash tables. Real
91/// positions are stored as `(abs_pos - position_base + 1) as u32`, so
92/// `0` is reserved as the "empty" marker and a true relative offset
93/// of `0` never appears in the table. Mirrors the LDM table's
94/// `LdmEntry.offset == 0` convention (see `encoding/ldm/table.rs`)
95/// so both rebasing structures share
96/// one sentinel scheme.
97pub(crate) const DFAST_EMPTY_SLOT: u32 = 0;
98
99/// Guard band reserved above the high-water mark before triggering a
100/// rebase on the Dfast hash tables. When the next insert would push a
101/// relative offset above `u32::MAX - DFAST_REBASE_GUARD_BAND`, the
102/// table calls `reduce(GUARD_BAND)` to shift every slot down and
103/// advance `position_base` so future inserts stay inside the `u32`
104/// window. Same scheme as `encoding/ldm/table.rs`.
105pub(crate) const DFAST_REBASE_GUARD_BAND: u32 = 1u32 << 30;
/// Log2 of the dfast skip-step growth interval (see
/// `DFAST_SKIP_STEP_GROWTH_INTERVAL`).
/// NOTE(review): presumably the dfast analogue of upstream zstd's
/// `kSearchStrength` step-skip acceleration — confirm at the use site.
pub(crate) const DFAST_SKIP_SEARCH_STRENGTH: usize = 6;
/// `1 << DFAST_SKIP_SEARCH_STRENGTH` = 64.
/// NOTE(review): presumably the number of consecutive misses between skip-step
/// increments — confirm at the use site.
pub(crate) const DFAST_SKIP_STEP_GROWTH_INTERVAL: usize = 1 << DFAST_SKIP_SEARCH_STRENGTH;
/// NOTE(review): presumably the upper bound on the dfast skip step — confirm
/// at the use site.
pub(crate) const DFAST_MAX_SKIP_STEP: usize = 8;
/// NOTE(review): presumably the dfast step used once input is judged
/// incompressible — confirm at the use site.
pub(crate) const DFAST_INCOMPRESSIBLE_SKIP_STEP: usize = 16;
/// Row hash-table width (log2) of the test-only `ROW_CONFIG` default.
pub(crate) const ROW_HASH_BITS: usize = 20;
/// `row_log` of the test-only `ROW_CONFIG` default.
pub(crate) const ROW_LOG: usize = 5;
/// `search_depth` of the test-only `ROW_CONFIG` default.
pub(crate) const ROW_SEARCH_DEPTH: usize = 16;
/// `target_len` of the test-only `ROW_CONFIG` default.
pub(crate) const ROW_TARGET_LEN: usize = 48;
/// NOTE(review): presumably the width in bits of the per-slot row tag —
/// confirm against the Row matcher.
pub(crate) const ROW_TAG_BITS: usize = 8;
/// NOTE(review): presumably the marker for an unoccupied row slot (note it is
/// `u32::MAX`, unlike the dfast tables' `0` sentinel) — confirm against the
/// Row matcher.
pub(crate) const ROW_EMPTY_SLOT: u32 = u32::MAX;
/// Row hash key width in bytes; `RowConfig::mls` documents that this stays 4
/// regardless of the configured min-match.
pub(crate) const ROW_HASH_KEY_LEN: usize = 4;
117// HASH_MIX_PRIME now lives in `crate::encoding::fastpath::scalar`; the four
118// per-CPU `hash_mix_u64` variants share it via that module.
119// HC_PRIME3BYTES / HC_PRIME4BYTES moved to match_table::storage
120// alongside the hash helpers in Phase 1e Stage A. Only the test
121// module references the constants directly (production code goes
122// through `MatchTable::hash_value_with_mls`).
123#[cfg(test)]
124use super::match_table::storage::{HC_PRIME3BYTES, HC_PRIME4BYTES};
125
126// HC_HASH_LOG / HC_CHAIN_LOG / HC3_HASH_LOG / HC_EMPTY live on the
127// shared storage module so MatchTable methods can reference them
128// without pulling in this module. Re-imported here so existing
129// macros / configs / tests keep their unqualified names.
130#[cfg(test)]
131use super::match_table::storage::HC_EMPTY;
132use super::match_table::storage::HC3_HASH_LOG;
133// HC_HASH_LOG / HC_CHAIN_LOG feed the test-only `HC_CONFIG` default.
134#[cfg(test)]
135use super::match_table::storage::{HC_CHAIN_LOG, HC_HASH_LOG};
136// HC3_MAX_OFFSET moved to encoding::bt alongside the hash3 candidate
137// probe macro that consumes it; the macro references it via the
138// fully-qualified `$crate::encoding::bt::HC3_MAX_OFFSET` path so this
139// module no longer needs a local import.
140const HC_SEARCH_DEPTH: usize = 16;
141// HC_MIN_MATCH_LEN moved to encoding::hc; re-imported here so
142// existing references compile unchanged.
143use super::hc::HC_MIN_MATCH_LEN;
144const HC_OPT_MIN_MATCH_LEN: usize = HC_FORMAT_MINMATCH;
145const HC_TARGET_LEN: usize = 48;
146
147// MAX_HC_SEARCH_DEPTH moved to encoding::hc alongside chain_candidates.
148use super::hc::MAX_HC_SEARCH_DEPTH;
149
150// `Strategy` and `StrategyTag` live in `crate::encoding::strategy`.
151// The driver carries a `StrategyTag` field set at `reset()` and
152// dispatches each block into a monomorphised `compress_block::<S>`
153// per concrete strategy.
154
/// Bundled tuning knobs for the hash-chain matcher. Using a typed config
/// instead of positional `usize` args eliminates parameter-order hazards.
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct HcConfig {
    /// Log2 of the hash-table size.
    hash_log: usize,
    /// Log2 of the chain-table size.
    chain_log: usize,
    /// Maximum number of candidates examined per search.
    search_depth: usize,
    /// Match length at which the search stops looking for something better.
    target_len: usize,
    /// Binary-tree finder hash width (upstream zstd `mls = BOUNDED(4, minMatch, 6)`),
    /// carried explicitly per level so it is NOT inferred from `target_len`
    /// (a `target_length` override must not silently flip the finder between
    /// 5- and 4-byte hashing). Only the BT body reads it; HC/lazy levels keep
    /// it at 4 (their `hash_position` is always 4-byte). 5 for the
    /// minMatch=5 BT levels (btlazy2 + btopt L16), 4 elsewhere.
    search_mls: usize,
}
171
/// Bundled tuning knobs for the Row matcher.
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) struct RowConfig {
    /// Log2 width of the row hash table (carries the upstream zstd `hashLog`).
    pub(crate) hash_bits: usize,
    /// Log2 of the number of slots per row (upstream zstd
    /// `rowLog = clamp(searchLog, 4, 6)`).
    pub(crate) row_log: usize,
    /// Number of candidates probed per search (upstream zstd
    /// `nbAttempts = 1 << min(searchLog, rowLog)`).
    pub(crate) search_depth: usize,
    /// Nice-match length: a match this long ends the search early.
    pub(crate) target_len: usize,
    /// Upstream zstd `cParams.minMatch` for the row matcher: the regular-search
    /// acceptance floor (a row candidate must extend to >= `mls` bytes).
    /// The C-like advanced API surfaces this as the row min-match knob.
    /// `ROW_MIN_MATCH_LEN` (5) is the default; the row hash key width stays
    /// 4 bytes (an internal detail), so this only tunes the acceptance
    /// floor, not the candidate hash distribution.
    pub(crate) mls: usize,
}
186
187// Only used as the default HashChain config when the test-only parse×search
188// override pairs a level with a backend its native row doesn't populate.
189#[cfg(test)]
190const HC_CONFIG: HcConfig = HcConfig {
191    hash_log: HC_HASH_LOG,
192    chain_log: HC_CHAIN_LOG,
193    search_depth: HC_SEARCH_DEPTH,
194    target_len: HC_TARGET_LEN,
195    search_mls: 4,
196};
197
198/// Base HashChain config synthesized when a public-parameter strategy
199/// override ([`super::parameters`]) routes a level to the HC / BT
200/// backend whose native level row didn't populate `hc` (e.g. forcing
201/// `Strategy::Lazy2` onto a level the table resolves to Fast). Mirrors
202/// the mid-band lazy defaults; the per-knob overrides then refine it.
203const HC_OVERRIDE_DEFAULT: HcConfig = HcConfig {
204    hash_log: super::match_table::storage::HC_HASH_LOG,
205    chain_log: super::match_table::storage::HC_CHAIN_LOG,
206    search_depth: HC_SEARCH_DEPTH,
207    target_len: HC_TARGET_LEN,
208    search_mls: 4,
209};
210
// Hash-chain / binary-tree configs for the btultra2 band. All use the 4-byte
// finder hash (`search_mls: 4`).
// NOTE(review): which level(s) select this base row is decided outside this
// view — confirm at the level table.
const BTULTRA2_HC_CONFIG: HcConfig = HcConfig {
    hash_log: 24,
    chain_log: 24,
    search_depth: 512,
    target_len: 256,
    search_mls: 4,
};

// NOTE(review): presumably the level-22 row for large / unknown source sizes
// (values look like the upstream zstd `clevels.h` L22 default row: hashLog 25,
// chainLog 27, searchLog 9, targetLength 999) — confirm against upstream.
const BTULTRA2_HC_CONFIG_L22: HcConfig = HcConfig {
    hash_log: 25,
    chain_log: 27,
    search_depth: 512,
    target_len: 999,
    search_mls: 4,
};

// NOTE(review): the `_256K` / `_128K` / `_16K` suffixes presumably name the
// source-size buckets of the level-22 row (the `source_size_ceil_log` docs
// mention "the L22 config buckets") — confirm at the selection site.
const BTULTRA2_HC_CONFIG_L22_256K: HcConfig = HcConfig {
    hash_log: 19,
    chain_log: 19,
    // 8192 candidates.
    search_depth: 1 << 13,
    target_len: 999,
    search_mls: 4,
};

const BTULTRA2_HC_CONFIG_L22_128K: HcConfig = HcConfig {
    hash_log: 17,
    chain_log: 18,
    // 2048 candidates.
    search_depth: 1 << 11,
    target_len: 999,
    search_mls: 4,
};

const BTULTRA2_HC_CONFIG_L22_16K: HcConfig = HcConfig {
    hash_log: 15,
    chain_log: 15,
    // 1024 candidates.
    search_depth: 1 << 10,
    target_len: 999,
    search_mls: 4,
};
250
251// Default Row config: only used by tests and the test-only parse×search
252// override (production greedy L5 carries its own `ROW_L5`).
253#[cfg(test)]
254const ROW_CONFIG: RowConfig = RowConfig {
255    hash_bits: ROW_HASH_BITS,
256    row_log: ROW_LOG,
257    search_depth: ROW_SEARCH_DEPTH,
258    target_len: ROW_TARGET_LEN,
259    mls: ROW_MIN_MATCH_LEN,
260};
261
262// Level-5 greedy is the ONLY strategy routed to the Row backend
263// (`StrategyTag::backend`: greedy -> Row; lazy / btopt / btultra* ->
264// HashChain), so it is the only level whose `row:` field is read. The upstream zstd
265// `clevels.h` default row (srcSize > 256 KB) for level 5 is searchLog=3,
266// targetLength=2, from which the row matcher derives:
267//   rowLog       = clamp(searchLog, 4, 6) = 4
268//   search_depth = 1 << min(searchLog, rowLog) = 8   (= nbAttempts)
269//   target_len   = targetLength = 2                  (nice-match early-out)
270// The shared `ROW_CONFIG` (row_log=5, search_depth=16, target_len=48) ran a
271// level-12-grade search here: 16 slots per row, never early-exiting until a
272// 48-byte match. That exhaustive walk was the dominant cost in greedy L5's
273// encode-speed regression vs FFI. `hash_bits` matches upstream zstd's
274// `ZSTD_getCParams(5, .., 0).hashLog` = 19 (verified via
275// `cparams_check 5`), so the row table is the same width as upstream's
276// (2^19 slots); the previous `ROW_HASH_BITS` (20) doubled both row tables vs
277// upstream, the dominant peak-memory excess on the greedy band.
278const ROW_L5: RowConfig = RowConfig {
279    hash_bits: 19,
280    row_log: 4,
281    search_depth: 8,
282    target_len: 2,
283    mls: ROW_MIN_MATCH_LEN,
284};
285
286// Upstream zstd `clevels.h` unbounded defaults for the lazy band, verified via
287// `ZSTD_getCParams(level, 0, 0)`:
288//   L6  { w21 c18 h19 s3 mml5 t4  lazy  } → rowLog 4, depth 1<<3 = 8
289//   L7  { w21 c19 h20 s4 mml5 t8  lazy  } → rowLog 4, depth 16
290//   L8  { w21 c19 h20 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
291//   L9  { w22 c20 h21 s4 mml5 t16 lazy2 } → rowLog 4, depth 16
292//   L10 { w22 c21 h22 s5 mml5 t16 lazy2 } → rowLog 5, depth 32
293//   L11 { w22 c21 h22 s6 mml5 t16 lazy2 } → rowLog 6, depth 64
294//   L12 { w22 c22 h23 s6 mml5 t32 lazy2 } → rowLog 6, depth 64
295// `rowLog = clamp(searchLog, 4, 6)`, `depth = 1 << min(searchLog, rowLog)`
296// (same derivation as `ROW_L5` above). `hash_bits` carries the upstream zstd
297// `hashLog`; the hinted-source clamp in `configure` caps it by the window
298// exactly like the upstream zstd `ZSTD_adjustCParams` path.
299const ROW_L6: RowConfig = RowConfig {
300    hash_bits: 19,
301    row_log: 4,
302    search_depth: 8,
303    target_len: 4,
304    mls: ROW_MIN_MATCH_LEN,
305};
306const ROW_L7: RowConfig = RowConfig {
307    hash_bits: 20,
308    row_log: 4,
309    search_depth: 16,
310    target_len: 8,
311    mls: ROW_MIN_MATCH_LEN,
312};
313const ROW_L8: RowConfig = RowConfig {
314    hash_bits: 20,
315    row_log: 4,
316    search_depth: 16,
317    target_len: 16,
318    mls: ROW_MIN_MATCH_LEN,
319};
320const ROW_L9: RowConfig = RowConfig {
321    hash_bits: 21,
322    row_log: 4,
323    search_depth: 16,
324    target_len: 16,
325    mls: ROW_MIN_MATCH_LEN,
326};
327const ROW_L10: RowConfig = RowConfig {
328    hash_bits: 22,
329    row_log: 5,
330    search_depth: 32,
331    target_len: 16,
332    mls: ROW_MIN_MATCH_LEN,
333};
334const ROW_L11: RowConfig = RowConfig {
335    hash_bits: 22,
336    row_log: 6,
337    search_depth: 64,
338    target_len: 16,
339    mls: ROW_MIN_MATCH_LEN,
340};
341const ROW_L12: RowConfig = RowConfig {
342    hash_bits: 23,
343    row_log: 6,
344    search_depth: 64,
345    target_len: 32,
346    mls: ROW_MIN_MATCH_LEN,
347};
348
/// Per-level Double-Fast hash sizing, mirroring the upstream zstd `clevels.h` columns
/// (config-driven, not a hardcoded constant): `long_hash_log` =
/// `cParams.hashLog` (the long 8-byte hash table), `short_hash_log` =
/// `cParams.chainLog` (the short hash table dfast repurposes as its
/// secondary index). Only the Dfast backend reads it, so non-dfast level
/// rows carry `dfast: None`. `minMatch` stays the upstream zstd-fixed `5`
/// (`DFAST_MIN_MATCH_LEN`, used in const contexts).
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct DfastConfig {
    /// Log2 size of the long hash table (upstream `hashLog`).
    long_hash_log: u8,
    /// Log2 size of the short hash table (upstream `chainLog`).
    short_hash_log: u8,
}
361
362// Upstream zstd clevels.h default row (srcSize > 256 KB): L3 {hashLog 17, chainLog 16},
363// L4 {hashLog 18, chainLog 18}.
364const DFAST_L3: DfastConfig = DfastConfig {
365    long_hash_log: 17,
366    short_hash_log: 16,
367};
368const DFAST_L4: DfastConfig = DfastConfig {
369    long_hash_log: 18,
370    short_hash_log: 18,
371};
372
/// Per-level Fast-strategy tuning, only consumed by the `FastKernelMatcher`
/// (Simple backend): `hash_log` = upstream zstd `cParams.hashLog`, `mls` = upstream zstd
/// `cParams.minMatch` (4..=8), `step_size` = upstream zstd `stepSize`. Carried as
/// `LevelParams.fast` (`Some` only on Fast level rows; `None` elsewhere).
///
/// Derives `Debug` so configs can appear in `assert_eq!` failures and
/// `{:?}` diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
struct FastConfig {
    /// Log2 size of the hash table (upstream `hashLog`).
    hash_log: u32,
    /// Minimum match length / hash key width (upstream `minMatch`).
    mls: u32,
    /// Upstream `stepSize`.
    step_size: usize,
}
383
384const FAST_L1: FastConfig = FastConfig {
385    hash_log: 14,
386    // Tier-0 (srcSize > 256 KiB) `cParams.minMatch`. Upstream zstd selects the
387    // Level-1 row from a 4-way srcSize-tiered table (`ZSTD_getCParams_internal`
388    // → `ZSTD_defaultCParameters[tableID][1]`), and minMatch shrinks for
389    // smaller inputs: 7 (>256 KiB) / 6 (16..256 KiB) / 5 (<=16 KiB). The base
390    // here is the tier-0 value; `fast_l1_mls_for_source_size` lowers it per the
391    // tier in `adjust_params_for_source_size`.
392    mls: 7,
393    step_size: 2,
394};
395const FAST_L2: FastConfig = FastConfig {
396    hash_log: 16,
397    mls: 6,
398    step_size: 2,
399};
400
/// Resolved tuning parameters for a compression level.
///
/// `strategy_tag` names the level's strategy; `search` is the decoupled
/// search-method axis. The runtime backend used by the driver dispatcher is
/// read from `search` (`LevelParams::backend` returns `self.search.backend()`),
/// and the parse mode from `search` + `lazy_depth` (`LevelParams::parse`);
/// `strategy_tag` itself drives `LevelParams::pre_split`.
#[derive(Copy, Clone, PartialEq, Eq)]
struct LevelParams {
    /// Strategy of this level (Fast / Dfast / Greedy / Lazy / Btlazy2 /
    /// BtOpt / BtUltra / BtUltra2).
    strategy_tag: super::strategy::StrategyTag,
    /// Decoupled search-method axis. Independent of `strategy_tag`'s
    /// parse half: a level can pair any parse (greedy / lazy depth via
    /// `lazy_depth`) with any search backend here. Defaults to the
    /// historical pairing (`strategy_tag.search()`) but is overridable
    /// per level so the parse×search matrix can be swept and tuned.
    search: super::strategy::SearchMethod,
    /// Log2 of the match window size.
    window_log: u8,
    /// Lazy-parse depth; `LevelParams::parse` maps it to a parse mode for
    /// every non-binary-tree search backend.
    lazy_depth: u8,
    /// Per-strategy tuning. On a native level row exactly one is `Some`,
    /// matching the level's backend, so the table self-documents which knobs a
    /// level actually consumes (the others are `None`, not dead placeholders):
    /// `fast` for the Fast/Simple backend, `dfast` for Double-Fast, `hc` for
    /// the HashChain (lazy / btopt / btultra*) backend, `row` for the Row
    /// backend. `apply_param_overrides` may synthesize an additional row when
    /// a strategy override moves a level onto a backend its native row did
    /// not populate, so more than one can be `Some` after overrides.
    fast: Option<FastConfig>,
    dfast: Option<DfastConfig>,
    hc: Option<HcConfig>,
    row: Option<RowConfig>,
}
428
429impl LevelParams {
430    /// Backend family (storage variant) for the driver dispatcher.
431    /// Derived from the decoupled `search` axis so a level can route to
432    /// a different search backend than its `strategy_tag` historically
433    /// implied.
434    fn backend(&self) -> super::strategy::BackendTag {
435        self.search.backend()
436    }
437
438    /// Parse mode derived from the decoupled `search` axis: the binary-tree
439    /// search path carries `ParseMode::Optimal`; every other search backend
440    /// derives greedy/lazy/lazy2 from `lazy_depth`. Reading `search` (not the
441    /// strategy tag) keeps the parse×search decoupling complete even when a
442    /// level whose tag is `Bt*` is overridden to a non-BT search backend.
443    fn parse(&self) -> super::strategy::ParseMode {
444        match self.search {
445            super::strategy::SearchMethod::BinaryTree => super::strategy::ParseMode::Optimal,
446            _ => super::strategy::ParseMode::from_lazy_depth(self.lazy_depth),
447        }
448    }
449
450    /// Cheap fingerprint pre-splitter level, the C-like `blockSplitterLevel`
451    /// knob. Mirrors the upstream zstd `splitLevels[]` table indexed by strategy in
452    /// `ZSTD_optimalBlockSize` (`{0,0,1,2,2,3,3,4,4,4}` over fast..btultra2):
453    /// fast=0, dfast=1, greedy=2, lazy=2, lazy2=3, btlazy2=3,
454    /// btopt/btultra/btultra2=4. We collapse the upstream zstd `lazy2` and `btlazy2`
455    /// strategies into the hash-chain `Lazy` tag, distinguished here by
456    /// `lazy_depth` (the level table runs both at depth 2), so depth 2 routes
457    /// to split level 3 to match the upstream zstd. `split_level == 0` routes to the
458    /// cheap from-borders heuristic; `1..=4` to byChunks with internal
459    /// sampling level `split_level - 1`. The `savings >= 3` gate in
460    /// `optimal_block_size` keeps incompressible data and the first full block
461    /// whole, so homogeneous frames are not over-split.
462    fn pre_split(&self) -> Option<u8> {
463        match self.strategy_tag {
464            super::strategy::StrategyTag::Fast => Some(0),
465            super::strategy::StrategyTag::Dfast => Some(1),
466            super::strategy::StrategyTag::Greedy => Some(2),
467            // The lazy2 / btlazy2 band (Lazy at lazy_depth >= 2, and Btlazy2)
468            // uses the rate-1 full-scan chunk splitter (4), NOT the rate-5
469            // sampler (3). The rate-5 sampler combined with the larger
470            // hash_log is sensitive enough to register a phantom statistical
471            // transition on perfectly homogeneous but periodic input (e.g. a
472            // repeating log-line stream whose period does not divide the 8 KB
473            // chunk size): the sampled bytes land on a different phase in each
474            // chunk, so two identical-distribution chunks look different and
475            // the block is split at 8 KB, then re-split on every window,
476            // cascading a large stream into hundreds of tiny blocks whose
477            // per-block headers dwarf the payload. The rate-1 scan reads every
478            // byte, so it sees periodic data as uniform and declines to split,
479            // while still finding genuine content boundaries (measured better
480            // ratio on the real decode corpus, and no longer expands a
481            // periodic stream vs a single full block). lazy/greedy keep the
482            // coarse samplers (lower hash_log => not sensitive enough to
483            // alias here).
484            super::strategy::StrategyTag::Lazy => {
485                if self.lazy_depth >= 2 {
486                    Some(4)
487                } else {
488                    Some(2)
489                }
490            }
491            super::strategy::StrategyTag::Btlazy2 => Some(4),
492            super::strategy::StrategyTag::BtOpt
493            | super::strategy::StrategyTag::BtUltra
494            | super::strategy::StrategyTag::BtUltra2 => Some(4),
495        }
496    }
497}
498
499/// Apply the public-parameter per-knob overrides (#27) onto the
500/// level-resolved [`LevelParams`], in place. Runs in [`Matcher::reset`]
501/// after the level params are computed and before backend selection, so
502/// a strategy override re-routes the backend uniformly. An all-`None`
503/// override is a no-op the caller skips via
504/// [`super::parameters::ParamOverrides::is_empty`], keeping the default
505/// level geometry byte-identical.
506fn apply_param_overrides(params: &mut LevelParams, ov: &super::parameters::ParamOverrides) {
507    use super::strategy::SearchMethod;
508
509    // 1. Strategy override re-derives tag / search / lazy depth.
510    if let Some(strategy) = ov.strategy {
511        let tag = strategy.tag();
512        params.strategy_tag = tag;
513        params.search = tag.search();
514        params.lazy_depth = strategy.lazy_depth();
515    }
516
517    // 2. Ensure the active backend's config row exists (synthesize a
518    //    default when a strategy override moved off the native row).
519    match params.search {
520        SearchMethod::Fast => {
521            params.fast.get_or_insert(FAST_L1);
522        }
523        SearchMethod::DoubleFast => {
524            params.dfast.get_or_insert(DFAST_L3);
525        }
526        SearchMethod::RowHash => {
527            params.row.get_or_insert(ROW_L5);
528        }
529        SearchMethod::HashChain | SearchMethod::BinaryTree => {
530            // A `Btlazy2` strategy override moved off a non-HC row needs the
531            // BT 5-byte finder hash (upstream zstd minMatch 5); other synthesized HC
532            // rows keep the 4-byte default. An explicit `min_match` override
533            // below refines this further.
534            params.hc.get_or_insert(HcConfig {
535                search_mls: if matches!(params.strategy_tag, super::strategy::StrategyTag::Btlazy2)
536                {
537                    5
538                } else {
539                    HC_OVERRIDE_DEFAULT.search_mls
540                },
541                ..HC_OVERRIDE_DEFAULT
542            });
543        }
544    }
545
546    // 3. window_log (bounds-checked at <= 30 by the builder).
547    if let Some(window_log) = ov.window_log {
548        params.window_log = window_log;
549    }
550
551    // 4. Per-backend numeric knobs map into the active config, mirroring
552    //    the upstream zstd `cParams` -> matcher translation documented on each
553    //    config struct.
554    match params.search {
555        SearchMethod::Fast => {
556            if let Some(fast) = params.fast.as_mut() {
557                if let Some(hash_log) = ov.hash_log {
558                    fast.hash_log = hash_log;
559                }
560                if let Some(min_match) = ov.min_match {
561                    fast.mls = min_match;
562                }
563            }
564        }
565        SearchMethod::DoubleFast => {
566            if let Some(dfast) = params.dfast.as_mut() {
567                // hashLog -> long table, chainLog -> short table (the
568                // dfast secondary index). Both bounds-checked <= 30, so
569                // the `u8` casts are lossless.
570                if let Some(hash_log) = ov.hash_log {
571                    dfast.long_hash_log = hash_log as u8;
572                }
573                if let Some(chain_log) = ov.chain_log {
574                    dfast.short_hash_log = chain_log as u8;
575                }
576            }
577        }
578        SearchMethod::RowHash => {
579            if let Some(row) = params.row.as_mut() {
580                // Row hash-table width override (mirrors dfast `long_hash_log`
581                // / hc `hash_log`). Row has no separate chain table — the
582                // per-row depth comes from `search_log` below — so only
583                // `hash_log` maps here; `chain_log` has no Row analogue.
584                if let Some(hash_log) = ov.hash_log {
585                    row.hash_bits = hash_log as usize;
586                }
587                if let Some(search_log) = ov.search_log {
588                    // Upstream zstd: rowLog = clamp(searchLog, 4, 6);
589                    //        nbAttempts = 1 << min(searchLog, rowLog).
590                    let row_log = (search_log as usize).clamp(4, 6);
591                    row.row_log = row_log;
592                    row.search_depth = 1usize << (search_log as usize).min(row_log);
593                }
594                if let Some(target_length) = ov.target_length {
595                    row.target_len = target_length as usize;
596                }
597                if let Some(min_match) = ov.min_match {
598                    row.mls = min_match as usize;
599                }
600            }
601        }
602        SearchMethod::HashChain | SearchMethod::BinaryTree => {
603            if let Some(hc) = params.hc.as_mut() {
604                if let Some(hash_log) = ov.hash_log {
605                    hc.hash_log = hash_log as usize;
606                }
607                if let Some(chain_log) = ov.chain_log {
608                    hc.chain_log = chain_log as usize;
609                }
610                if let Some(search_log) = ov.search_log {
611                    hc.search_depth = 1usize << search_log;
612                }
613                if let Some(target_length) = ov.target_length {
614                    hc.target_len = target_length as usize;
615                }
616                if let Some(min_match) = ov.min_match {
617                    // Upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`: a BT
618                    // min_match override maps into the finder hash width. Only
619                    // the BT body reads `search_mls`; HC/lazy keep 4-byte
620                    // hashing regardless, so this is a no-op for them.
621                    hc.search_mls = (min_match as usize).clamp(4, 6);
622                }
623            }
624        }
625    }
626}
627
/// Translate the resolved runtime strategy into the upstream zstd LDM
/// strategy ordinal (1..=9) consumed by
/// [`super::ldm::params::LdmParams::adjust_for`].
///
/// The `Lazy` tag covers two upstream strategies and is disambiguated by
/// `lazy_depth`: depth `>= 2` is lazy2 (5), anything lower is lazy (4).
#[cfg(feature = "hash")]
fn ldm_strategy_ordinal(tag: super::strategy::StrategyTag, lazy_depth: u8) -> u32 {
    use super::strategy::StrategyTag as Tag;

    match tag {
        Tag::Fast => 1,
        Tag::Dfast => 2,
        Tag::Greedy => 3,
        Tag::Lazy if lazy_depth >= 2 => 5,
        Tag::Lazy => 4,
        // Upstream zstd `ZSTD_btlazy2` ordinal.
        Tag::Btlazy2 => 6,
        Tag::BtOpt => 7,
        Tag::BtUltra => 8,
        Tag::BtUltra2 => 9,
    }
}
652
653/// `ceil(log2(size))` of a source-size hint, with a zero hint floored to
654/// [`MIN_WINDOW_LOG`]. This is the single quantization every hint-dependent
655/// matcher parameter is derived from: the window-log cap, the HC / Fast hash
656/// and chain widths, the Dfast / Row table widths, the L22 config buckets, and
657/// the Fast attach-vs-copy cutoff. Two hints sharing this value resolve to the
658/// identical matcher shape, which is why it (not the raw byte count) keys the
659/// primed-dictionary snapshot — see [`PrimedKey`]. Operates on the full `u64`
660/// so callers comparing a hint against a cutoff get the same bucketed decision
661/// here and at the driver, with no `as usize` truncation on 32-bit targets.
662pub(crate) fn source_size_ceil_log(size: u64) -> u8 {
663    if size == 0 {
664        MIN_WINDOW_LOG
665    } else {
666        (64 - (size - 1).leading_zeros()) as u8
667    }
668}
669
/// Attach-vs-copy cutoff for the Fast strategy, as a ceil-log bucket: a hint at
/// or below `2^this` (or unknown, `None`) ATTACHES the dictionary (a separate
/// immutable table scanned in place via the borrowed dual-base kernel); a larger
/// hint would COPY it into the live table.
///
/// We set this to `31` so every dictionary source up to 2 GiB attaches,
/// diverging from upstream zstd's 8 KiB `ZSTD_shouldAttachDict` cutoff ON
/// PURPOSE: upstream copy mode copies the small CDict TABLES into the cctx and
/// still scans the input in place, but our flat-history copy path memmoves the
/// whole INPUT into history every frame (profiled at 30% `__memmove` + 14%
/// `__memset` on a reused 1 MiB dict encode). Attach mode scans the caller's
/// input in place with the dict as a separate prefix base, so it is strictly
/// faster for every frame size here (measured: 1 MiB dict frame 167 us -> 52 us,
/// 0.42x of C; 10 KiB 20.4 us -> 4.4 us, 0.17x of C). The dual-base kernel
/// carries `window_low`, so over-window inputs stay in-window and C-decodable.
///
/// `31` is also the largest bucket the borrowed kernel can attach: it stores
/// virtual positions as `u32` (`cur_abs as u32`), so the maximum attached source
/// `1 << 31` (plus the dict prefix) stays below `u32::MAX`; the next bucket `32`
/// (4 GiB) would wrap that arithmetic. Sources past 2 GiB therefore fall back to
/// copy mode — rare in practice, and the relative copy cost shrinks as the
/// source grows. Per the drop-in-not-binary-parity contract, we make this match
/// decision ourselves.
///
/// Shared by `reset` (records the mode in the primed-snapshot key) and
/// `prime_with_dictionary` (acts on it).
pub(crate) const FAST_ATTACH_DICT_CUTOFF_LOG: u8 = 31;

/// Largest dictionary region (bytes) the Fast attach path can index. The tagged
/// dict table packs each position into `32 - DICT_TAG_BITS` (= 24) bits, so a
/// region past `2^24` (16 MiB) would overflow the packed position. Dictionaries
/// this large fall back to COPY mode, whose live table stores full `u32`
/// positions and handles them. The size hint set on dict load equals the actual
/// dict content length, so the attach-vs-copy decision (and the matching
/// snapshot-key / epoch bits) can gate on it consistently at reset time.
pub(crate) const MAX_FAST_ATTACH_DICT_REGION: usize = 1 << 24;
705
/// Dfast counterpart of [`FAST_ATTACH_DICT_CUTOFF_LOG`]: upstream zstd
/// `ZSTD_dictMatchState` attach cutoff for the double-fast strategy is 16 KiB
/// (`2^14`), so small / unknown-size inputs ATTACH (separate immutable dict
/// long+short tables + dual-probe in `start_matching_fast_loop`) and larger
/// known-size inputs COPY (re-prime the dict into the live tables, where the
/// dense scan matches it as window history). The attach build also self-gates
/// on `use_fast_loop` inside `skip_matching_for_dict_attach` — only the
/// fast-loop levels (L3 / Default / L0) carry the dual-probe.
const DFAST_ATTACH_DICT_CUTOFF_LOG: u8 = 14;

/// `ZSTD_dictMatchState` attach cutoff for the Row (greedy/lazy) strategy is
/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs`): small / unknown-size inputs
/// ATTACH the dict into the separate immutable row index (bounded dual-probe in
/// `row_candidate_rl`), larger known-size inputs dense-COPY into the live rows.
const ROW_ATTACH_DICT_CUTOFF_LOG: u8 = 15;

/// Attach cutoff for the hash-chain (HC) strategy, as a ceil-log bucket:
/// 32 KiB (`2^15`, upstream zstd `attachDictSizeCutoffs[ZSTD_lazy2]`). Small /
/// unknown-size inputs ATTACH the dict as a separate hash-chain dms (the dual
/// search in `find_best_match` walks the live input chain + the dms), larger
/// known-size inputs dense-COPY (merge the dict into the live chain and search
/// the one combined chain).
const HC_ATTACH_DICT_CUTOFF_LOG: u8 = 15;

/// BT/optimal attach cutoff for `btlazy2` + `btopt`: 32 KiB (`2^15`, upstream
/// zstd `attachDictSizeCutoffs[ZSTD_btlazy2]` == `[ZSTD_btopt]`). Small /
/// unknown-size inputs ATTACH the dict as a separate DUBT dms; larger known-size
/// inputs COPY the dict into the LIVE binary tree (upstream zstd
/// `ZSTD_resetCCtx_byCopyingCDict`).
const BT_OPT_ATTACH_DICT_CUTOFF_LOG: u8 = 15;

/// BT/optimal attach cutoff for `btultra` + `btultra2`: 8 KiB (`2^13`, upstream
/// zstd `attachDictSizeCutoffs[ZSTD_btultra]` == `[ZSTD_btultra2]`). The deepest
/// parses copy the dict into the live tree past a much smaller source than the
/// `btopt` tier, matching upstream's per-strategy cutoff table.
const BT_ULTRA_ATTACH_DICT_CUTOFF_LOG: u8 = 13;
741
742// Source-size cap for the dfast hash bits when a size hint is present: a tiny
743// input needs no larger hash than its window. The upstream zstd `cParams.hashLog` /
744// `chainLog` (from `DfastConfig`) caps it from above at the call site.
745fn dfast_hash_bits_for_window(max_window_size: usize) -> usize {
746    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
747    window_log.max(MIN_WINDOW_LOG as usize)
748}
749
750fn row_hash_bits_for_window(max_window_size: usize) -> usize {
751    // Upstream zstd `ZSTD_adjustCParams_internal` cap: `hashLog <= windowLog + 1`.
752    // The `+ 1` is load-bearing for L12, whose upstream zstd hashLog (23) exceeds
753    // its windowLog (22) — a plain `windowLog` cap would shrink the L12
754    // table on EVERY hinted reset and split primed snapshots between
755    // hinted and unhinted frames that resolve to the identical geometry.
756    // No constant upper clamp: the old `ROW_HASH_BITS` (20) ceiling
757    // predates the lazy band moving onto Row (L9-12 carry upstream zstd hashLog
758    // 21-23).
759    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
760    (window_log + 1).max(MIN_WINDOW_LOG as usize)
761}
762
763/// `floor(log2(window))` for the HashChain table-log cap (upstream zstd
764/// `ZSTD_adjustCParams_internal`). The caller clamps the level's `hash_log` /
765/// `chain_log` from above with this so a small hinted input doesn't allocate the
766/// full level's tables.
767fn hc_hash_bits_for_window(max_window_size: usize) -> usize {
768    let window_log = (usize::BITS - 1 - max_window_size.leading_zeros()) as usize;
769    window_log.max(MIN_WINDOW_LOG as usize)
770}
771
/// Parameter table for numeric compression levels 1–22.
///
/// Each entry maps a zstd compression level to the best-available matcher
/// backend and tuning knobs. High levels map to dedicated parse modes:
/// btopt (16-17), btultra (18), btultra2 (19-22) — matching upstream zstd
/// `clevels.h` (level 19 is `ZSTD_btultra2`, not plain btultra).
///
/// Index 0 = level 1, index 21 = level 22.
#[rustfmt::skip]
const LEVEL_TABLE: [LevelParams; 22] = [
    // Exactly one of fast/dfast/hc/row is Some per row, matching the strategy
    // backend; the rest are None (not dead placeholders).
    // Lvl  Strategy       wlog  lazy  per-strategy config
    // ---  -------------- ----  ----  -------------------
    /* 1 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 19, lazy_depth: 0, fast: Some(FAST_L1), dfast: None, hc: None, row: None },
    /* 2 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Fast, search: super::strategy::SearchMethod::Fast, window_log: 20, lazy_depth: 0, fast: Some(FAST_L2), dfast: None, hc: None, row: None },
    /* 3 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L3), hc: None, row: None },
    /* 4 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Dfast, search: super::strategy::SearchMethod::DoubleFast, window_log: 21, lazy_depth: 1, fast: None, dfast: Some(DFAST_L4), hc: None, row: None },
    // target_len column for L5..=L15 matches upstream zstd cParams.targetLength
    // from clevels.h table[0] (default — srcSize > 256 KB). Upstream zstd uses
    // it as the lazy outer loop's `sufficient_len` (nice-match) threshold.
    // Inflating it above upstream zstd forces the chain walk to complete
    // search_depth iterations instead of breaking on the first
    // long-enough match — the dominant cost in the L5..=L15 speed
    // regression vs FFI (see lazy_band_target_len_matches_default_table).
    /* 5 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Greedy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 0, fast: None, dfast: None, hc: None, row: Some(ROW_L5) },
    // L6-12: upstream zstd runs the lazy/lazy2 strategies on the ROW-based
    // match finder by default (`ZSTD_resolveRowMatchFinderMode`: row mode
    // is on for greedy..lazy2 whenever SIMD is available) — a bounded
    // SIMD tag scan per row instead of a pointer-chasing hash-chain walk.
    // Our HashChain walk on these levels was ~75% of L10 wall time on the
    // 1 MiB corpus (dependent chain-table loads). Same `RowConfig`
    // derivation as `ROW_L5` above, upstream zstd values per level in the
    // `ROW_L6..ROW_L12` comment block.
    /* 6 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L6) },
    /* 7 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 1, fast: None, dfast: None, hc: None, row: Some(ROW_L7) },
    /* 8 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 21, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L8) },
    /* 9 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L9) },
    /*10 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L10) },
    /*11 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L11) },
    /*12 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Lazy, search: super::strategy::SearchMethod::RowHash, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: None, row: Some(ROW_L12) },
    // L13-15: reference uses btlazy2 (binary-tree finder) with searchLog 4/5/6
    // (search_depth 16/32/64) and targetLength 32. We run the hash-chain Lazy
    // parser here, so we mirror the reference search budget rather than inflate
    // it: matching the table keeps speed near the reference and makes per-level
    // perf divergences comparable. The binary-tree finder that would let a
    // smaller searchLog find longer matches (and re-establish a strict ratio
    // ladder above L12) is tracked separately; until it lands these levels sit
    // close to L12 on hash-chain inputs by design.
    // NOTE(review): the L13-15 rows below are tagged `StrategyTag::Btlazy2` /
    // `SearchMethod::BinaryTree`, which reads as if the binary-tree finder is
    // already selected for them; the hash-chain wording above may be stale —
    // confirm against the backend dispatch before relying on it.
    /*13 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 16, target_len: 32, search_mls: 5 }), row: None },
    /*14 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 22, search_depth: 32, target_len: 32, search_mls: 5 }), row: None },
    /*15 */ LevelParams { strategy_tag: super::strategy::StrategyTag::Btlazy2, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 23, search_depth: 64, target_len: 32, search_mls: 5 }), row: None },
    /*16 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 22, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 22, search_depth: 32, target_len: 48, search_mls: 5 }), row: None },
    /*17 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtOpt, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 32, target_len: 64, search_mls: 4 }), row: None },
    /*18 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 23, search_depth: 64, target_len: 64, search_mls: 4 }), row: None },
    /*19 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 23, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 22, chain_log: 24, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
    /*20 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 25, lazy_depth: 2, fast: None, dfast: None, hc: Some(HcConfig { hash_log: 23, chain_log: 25, search_depth: 128, target_len: 256, search_mls: 4 }), row: None },
    /*21 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 26, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG), row: None },
    /*22 */ LevelParams { strategy_tag: super::strategy::StrategyTag::BtUltra2, search: super::strategy::SearchMethod::BinaryTree, window_log: 27, lazy_depth: 2, fast: None, dfast: None, hc: Some(BTULTRA2_HC_CONFIG_L22), row: None },
];
832
833/// Upstream `ZSTD_createCDict` table geometry: the `(hash_log, chain_log)` a
834/// dictionary's prepared match-finder tables get. Thin adapter over the single
835/// cParams source [`super::cparams::create_cdict_table_logs`], which mirrors
836/// `ZSTD_adjustCParams_internal` under `ZSTD_cpm_createCDict`. `window_log` is
837/// the resolved compress window; `hash_log` / `chain_log` are the level's own
838/// widths; `uses_bt` selects the binary-tree `cycleLog` (`chainLog - 1`).
839fn cdict_table_logs(
840    window_log: u8,
841    hash_log: usize,
842    chain_log: usize,
843    uses_bt: bool,
844    dict_size: usize,
845) -> (usize, usize) {
846    let (h, c) = super::cparams::create_cdict_table_logs(
847        window_log,
848        hash_log as u32,
849        chain_log as u32,
850        uses_bt,
851        dict_size,
852    );
853    (h as usize, c as usize)
854}
855
/// Smallest window_log the encoder will use regardless of source size
/// (`2^10` = 1 KiB). It is also the bucket `source_size_ceil_log` reports for
/// a zero-byte size hint.
pub(crate) const MIN_WINDOW_LOG: u8 = 10;
/// Conservative floor for source-size-hinted window tuning.
///
/// Hinted windows below 16 KiB (`window_log < 14`) currently regress C-FFI
/// interoperability on certain compressed-block patterns. Keep hinted
/// windows at 16 KiB or larger until that compatibility gap is closed.
const MIN_HINTED_WINDOW_LOG: u8 = 14;
864
// NOTE: this paragraph describes `adjust_params_for_source_size` (defined
// further down), not the tier helper documented immediately below. It is a
// plain `//` comment so it stays out of that helper's rustdoc.
//
// Adjust level parameters for a known source size.
//
// This derives a cap from `ceil(log2(src_size))`, then clamps it to
// `MIN_HINTED_WINDOW_LOG` (16 KiB). A zero-byte size hint is treated as
// `MIN_WINDOW_LOG` for the raw ceil-log step and then promoted to the hinted
// floor. This keeps tables bounded for small inputs while preserving the
// encoder's baseline minimum supported window.
// For the HC backend, `hash_log` and `chain_log` are capped from above as
// well.
//
/// Source-size tier index, matching upstream `ZSTD_getCParams_internal`'s
/// `tableID = (rSize<=256K)+(rSize<=128K)+(rSize<=16K)`.
///
/// Returns 0 for sources larger than 256 KiB or of unknown size (`None`),
/// 1 for 128..256 KiB, 2 for 16..128 KiB and 3 for sources of at most 16 KiB.
/// Every upper bound is inclusive.
fn cparams_tier(source_size: Option<u64>) -> usize {
    const KIB: u64 = 1024;
    // One tier step per threshold the source fits under, as in the upstream sum.
    const TIER_LIMITS: [u64; 3] = [256 * KIB, 128 * KIB, 16 * KIB];

    source_size.map_or(0, |bytes| {
        TIER_LIMITS.iter().filter(|&&limit| bytes <= limit).count()
    })
}
885
886/// Override a Fast (L1/L2) or Dfast (L3) level row's table-shaping cParams
887/// (hashLog / chainLog / minMatch) by source-size tier, matching the
888/// reference `ZSTD_defaultCParameters[tableID][level]`. L1 keeps its base
889/// hashLog (the source-size window clamp in `adjust_params_for_source_size`
890/// already lands on the reference value) and only tiers minMatch; L2 also
891/// tiers hashLog (the tier-0 value 16 oversized the table on medium inputs,
892/// the page-fault pathology); L3 tiers both dfast hash widths. Strategy
893/// switches (L2 tier 1, L4) are intentionally not applied here.
894fn apply_cparams_tier(level: i32, source_size: Option<u64>, p: &mut LevelParams) {
895    let tier = cparams_tier(source_size);
896    // Single source for the table data: the verbatim upstream
897    // `ZSTD_defaultCParameters[tier][level]` row (`cparams::default_cparams`).
898    // The encoder consumes only the table-shaping widths here; the window /
899    // `table_log` clamp lives in `adjust_params_for_source_size`.
900    match level {
901        // Fast, all tiers — minMatch only (hashLog handled by the window clamp).
902        1 => {
903            if let Some(f) = p.fast.as_mut() {
904                f.mls = super::cparams::default_cparams(tier, 1).min_match;
905            }
906        }
907        // Fast (base strategy; tier 1 is dfast upstream — not switched here).
908        2 => {
909            if let Some(f) = p.fast.as_mut() {
910                let cp = super::cparams::default_cparams(tier, 2);
911                f.hash_log = cp.hash_log;
912                f.mls = cp.min_match;
913            }
914        }
915        // Dfast, all tiers — long hashLog (`hash_log`) + short chainLog (`chain_log`).
916        3 => {
917            if let Some(d) = p.dfast.as_mut() {
918                let cp = super::cparams::default_cparams(tier, 3);
919                d.long_hash_log = cp.hash_log as u8;
920                d.short_hash_log = cp.chain_log as u8;
921            }
922        }
923        _ => {}
924    }
925}
926
927fn adjust_params_for_source_size(mut params: LevelParams, src_size: u64) -> LevelParams {
928    // Derive a source-size-based cap from ceil(log2(src_size)), then
929    // clamp first to MIN_WINDOW_LOG (baseline encoder minimum) and then to
930    // MIN_HINTED_WINDOW_LOG (16 KiB hinted floor). For tiny or zero hints we
931    // therefore keep a 16 KiB effective minimum window in hinted mode.
932    // Raw ceil(log2(src_size)) drives the internal table sizes. The
933    // advertised `window_log` is separately floored at MIN_HINTED_WINDOW_LOG
934    // (a decoder-interop requirement on the wire format), but the hash /
935    // chain table widths are internal and never appear in the frame, so they
936    // can track the actual source size below that floor.
937    let raw_src_log = source_size_ceil_log(src_size);
938    let src_log = raw_src_log.max(MIN_WINDOW_LOG).max(MIN_HINTED_WINDOW_LOG);
939    if src_log < params.window_log {
940        params.window_log = src_log;
941    }
942    // Internal match-finder tables are sized from `table_log` — the RAW
943    // source log (floored only at the baseline `MIN_WINDOW_LOG`), NOT the
944    // wire `window_log` floor. The table widths never appear in the frame, so
945    // for small inputs they can track the actual source size and avoid
946    // zeroing a window-sized table per frame; large inputs keep the level's
947    // widths. The cap is applied with the same per-backend headroom the
948    // level table uses, so the load factor (and match quality) is unchanged.
949    // The Dfast backend derives its table widths from the source in `reset`
950    // (`set_hash_bits` recomputes there), so it is not adjusted here. The Row
951    // backend's width IS capped here, mirroring the upstream zstd (see the Row branch).
952    let table_log = raw_src_log.max(MIN_WINDOW_LOG);
953    let backend = params.backend();
954    if backend == super::strategy::BackendTag::HashChain {
955        let hc = params
956            .hc
957            .as_mut()
958            .expect("HashChain level row carries an HcConfig");
959        if (table_log + 2) < hc.hash_log as u8 {
960            hc.hash_log = (table_log + 2) as usize;
961        }
962        if (table_log + 1) < hc.chain_log as u8 {
963            hc.chain_log = (table_log + 1) as usize;
964        }
965    } else if backend == super::strategy::BackendTag::Row {
966        let row = params
967            .row
968            .as_mut()
969            .expect("Row level row carries a RowConfig");
970        // Upstream zstd `ZSTD_adjustCParams_internal` (zstd_compress.c): once
971        // the window is source-capped, `hashLog <= windowLog + 1`. The row
972        // table is `2^hash_bits` slots, exactly upstream's row hashTable
973        // `2^hashLog` slots, so the same cap applies. Without it the row table
974        // stays at the level's unbounded width (e.g. L12 hash_bits 23 = 4x
975        // upstream's source-capped 21), the dominant peak-memory excess on the
976        // row band.
977        let row_cap = (table_log + 1) as usize;
978        if row_cap < row.hash_bits {
979            row.hash_bits = row_cap;
980        }
981    } else if backend == super::strategy::BackendTag::Simple {
982        let fast = params
983            .fast
984            .as_mut()
985            .expect("Fast level row carries a FastConfig");
986        let fast_cap = (table_log + 1) as u32;
987        if fast_cap < fast.hash_log {
988            fast.hash_log = fast_cap;
989        }
990    }
991    params
992}
993
994fn level22_btultra2_params_for_source_size(source_size: Option<u64>) -> LevelParams {
995    let mut hc = match source_size {
996        Some(size) if size <= 16 * 1024 => BTULTRA2_HC_CONFIG_L22_16K,
997        Some(size) if size <= 128 * 1024 => BTULTRA2_HC_CONFIG_L22_128K,
998        Some(size) if size <= 256 * 1024 => BTULTRA2_HC_CONFIG_L22_256K,
999        _ => BTULTRA2_HC_CONFIG_L22,
1000    };
1001    let mut window_log = match source_size {
1002        Some(size) if size <= 16 * 1024 => 14,
1003        Some(size) if size <= 128 * 1024 => 17,
1004        Some(size) if size <= 256 * 1024 => 18,
1005        _ => 27,
1006    };
1007    if let Some(size) = source_size
1008        && size > 256 * 1024
1009    {
1010        let src_log = source_size_ceil_log(size);
1011        window_log = window_log.min(src_log.max(MIN_WINDOW_LOG));
1012        let adjusted_table_log = window_log as usize + 1;
1013        hc.hash_log = hc.hash_log.min(adjusted_table_log);
1014        hc.chain_log = hc.chain_log.min(adjusted_table_log);
1015    }
1016    LevelParams {
1017        strategy_tag: super::strategy::StrategyTag::BtUltra2,
1018        search: super::strategy::SearchMethod::BinaryTree,
1019        window_log,
1020        lazy_depth: 2,
1021        fast: None,
1022        dfast: None,
1023        hc: Some(hc),
1024        row: None,
1025    }
1026}
1027
1028/// Estimated steady-state heap footprint of a one-shot compression context
1029/// at `level` (window history + match-finder tables + block staging), in
1030/// bytes. Computed from the same per-level tuning table the encoder
1031/// resolves at frame start, so the estimate tracks the real allocations;
1032/// it is an upper-bound style budget figure, not an exact accounting.
1033pub fn estimated_compression_workspace_bytes(level: CompressionLevel) -> usize {
1034    use super::strategy::StrategyTag;
1035    let params = resolve_level_params(level, None);
1036    let window = 1usize << params.window_log;
1037    // Mirror `configure()`: the HC3 short-match side table exists only on
1038    // the btultra/btultra2 tags (minMatch 3), capped by the window log; the
1039    // BT pointer-pair layout fits inside the `4 << chain_log` chain term
1040    // (pairs over `chain_log - 1` nodes).
1041    let wants_hash3 = matches!(
1042        params.strategy_tag,
1043        StrategyTag::BtUltra | StrategyTag::BtUltra2
1044    );
1045    let uses_bt = matches!(
1046        params.strategy_tag,
1047        StrategyTag::Btlazy2 | StrategyTag::BtOpt | StrategyTag::BtUltra | StrategyTag::BtUltra2
1048    );
1049    let tables = params.fast.map(|f| 4usize << f.hash_log).unwrap_or(0)
1050        + params
1051            .dfast
1052            .map(|d| (4usize << d.long_hash_log) + (4usize << d.short_hash_log))
1053            .unwrap_or(0)
1054        + params
1055            .hc
1056            .map(|h| {
1057                let hash3 = if wants_hash3 {
1058                    4usize
1059                        << super::match_table::storage::HC3_HASH_LOG.min(params.window_log as usize)
1060                } else {
1061                    0
1062                };
1063                (4usize << h.hash_log) + (4usize << h.chain_log) + hash3
1064            })
1065            .unwrap_or(0)
1066        + params
1067            .row
1068            .map(|r| (4usize << r.hash_bits) + (2usize << r.hash_bits))
1069            .unwrap_or(0);
1070    // BT modes box a `BtMatcher`; its retained scratch layout is budgeted
1071    // next to the struct so estimator and allocator evolve together.
1072    let bt = if uses_bt {
1073        super::bt::BtMatcher::estimated_workspace_bytes()
1074    } else {
1075        0
1076    };
1077    // Block staging: literal + sequence buffers plus the compressed-block
1078    // scratch, each bounded by the 128 KiB block size.
1079    let staging = 3 * (128 * 1024);
1080    window + tables + bt + staging
1081}
1082
1083/// Extra steady-state workspace the binary-tree strategies (ordinals 6..=9,
1084/// btlazy2..btultra2) retain beyond the hash/chain tables: the boxed matcher
1085/// plus its scratch arenas, and the HC3 short-match side table for
1086/// btultra/btultra2 (capped by the window log). 0 for non-BT ordinals.
1087pub fn estimated_bt_strategy_extra_bytes(strategy_ordinal: u32, window_log: u32) -> usize {
1088    if !(6..=9).contains(&strategy_ordinal) {
1089        return 0;
1090    }
1091    let hash3 = if matches!(strategy_ordinal, 8 | 9) {
1092        4usize << super::match_table::storage::HC3_HASH_LOG.min(window_log as usize)
1093    } else {
1094        0
1095    };
1096    super::bt::BtMatcher::estimated_workspace_bytes() + hash3
1097}
1098
1099/// Resolve a [`CompressionLevel`] to internal tuning parameters,
1100/// optionally adjusted for a known source size.
1101fn resolve_level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
1102    if matches!(level, CompressionLevel::Level(22)) {
1103        return level22_btultra2_params_for_source_size(source_size);
1104    }
1105    let params = match level {
1106        CompressionLevel::Uncompressed => LevelParams {
1107            strategy_tag: super::strategy::StrategyTag::Fast,
1108            search: super::strategy::SearchMethod::Fast,
1109            // Uncompressed frames emit raw blocks and never reference
1110            // history; advertising a larger window only inflates
1111            // decoder-side buffer reservation. Stay at 17 (128 KiB).
1112            window_log: 17,
1113            lazy_depth: 0,
1114            // Beyond-upstream zstd: hash_log=14 (vs upstream zstd's 13) for 2× fewer
1115            // collisions on structured corpora. Upstream zstd's "base for negative"
1116            // row has targetLength=1 → step_size = 1 + 0 + 1 = 2.
1117            fast: Some(FastConfig {
1118                hash_log: 14,
1119                mls: 6,
1120                step_size: 2,
1121            }),
1122            dfast: None,
1123            hc: None,
1124            row: None,
1125        },
1126        CompressionLevel::Fastest => {
1127            // Only the Fast-specific cParams
1128            // (fast_hash_log / fast_mls / fast_step_size) align
1129            // with Uncompressed / negative-base row. window_log
1130            // stays at LEVEL_TABLE[0]'s value (19) — Fastest still
1131            // does real compression on a full window, unlike
1132            // Uncompressed which clamps to 17.
1133            let mut p = LEVEL_TABLE[0];
1134            p.fast = Some(FastConfig {
1135                hash_log: 14,
1136                mls: 6,
1137                step_size: 2,
1138            });
1139            p
1140        }
1141        CompressionLevel::Default => {
1142            // Default == Level(DEFAULT_LEVEL); tier it the same way an explicit
1143            // positive level is, so hinted default compression shrinks its
1144            // table widths on small / medium frames instead of keeping the
1145            // tier-0 row (the oversized-table page-fault pathology).
1146            let mut p = LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1];
1147            apply_cparams_tier(CompressionLevel::DEFAULT_LEVEL, source_size, &mut p);
1148            p
1149        }
1150        CompressionLevel::Better => LEVEL_TABLE[6],
1151        // Level 13: the first dominant point of the deep-lazy band. The
1152        // mls-wide row key lifted the shallow band's ratio enough that
1153        // level 11 no longer strictly beats level 7 on the ladder corpus;
1154        // the `Best` alias belongs on a config that dominates everything
1155        // below it rather than on a hair-thin margin.
1156        CompressionLevel::Best => LEVEL_TABLE[12],
1157        CompressionLevel::Level(n) => {
1158            if n > 0 {
1159                let idx = (n as usize).min(CompressionLevel::MAX_LEVEL as usize) - 1;
1160                let mut p = LEVEL_TABLE[idx];
1161                // Upstream zstd selects the cParams row from a 4-way
1162                // source-size-tiered table (`ZSTD_getCParams_internal` →
1163                // `ZSTD_defaultCParameters[tableID][level]`), and the Fast /
1164                // Dfast hashLog, chainLog and minMatch shrink for smaller
1165                // inputs. The `LEVEL_TABLE` base is the tier-0 (> 256 KiB) row;
1166                // override the table-shaping params per tier here so small and
1167                // medium frames use the reference's table widths (the oversized
1168                // tier-0 widths were a per-frame alloc / page-fault pathology on
1169                // medium inputs) and minMatch (short matches the wide hash
1170                // skips). NOTE: the reference also switches STRATEGY in some
1171                // tiers (L2 → dfast at 128..256 KiB, L4 → greedy at <= 16 KiB
1172                // and 128..256 KiB); those backend switches are not yet tiered,
1173                // so those tiers keep the base strategy.
1174                apply_cparams_tier(n, source_size, &mut p);
1175                p
1176            } else if n == 0 {
1177                // Level 0 = default, matching C zstd semantics. Tier it like the
1178                // `Default` alias so `Level(0)` and `Default` stay identical.
1179                let mut p = LEVEL_TABLE[CompressionLevel::DEFAULT_LEVEL as usize - 1];
1180                apply_cparams_tier(CompressionLevel::DEFAULT_LEVEL, source_size, &mut p);
1181                p
1182            } else {
1183                // Negative levels — upstream zstd sets
1184                // targetLength = -level (clampedCompressionLevel),
1185                // yielding step_size = (-level) + 1 since
1186                // !(targetLength) = 0 when targetLength > 0.
1187                // So L-1..L-7 get step_size 2..8. Acceleration
1188                // gradient comes from larger step skipping more
1189                // positions per iter (faster, worse ratio).
1190                // Clamp to upstream zstd's MIN_LEVEL before negating so
1191                // i32::MIN can't overflow on `-n`.
1192                let clamped = n.max(CompressionLevel::MIN_LEVEL);
1193                let target_length = (-clamped) as usize;
1194                let step_size = target_length + 1;
1195                // Upstream zstd row-0 ("base for negative", clevels.h srcSize>256KB):
1196                // hashLog=13, minMatch=7. The 32 KiB hash table (2^13 * 4B)
1197                // is L1d-resident on contemporary cores, so every probe is an
1198                // L1 hit; hashLog=14 (64 KiB) overflows a 32 KiB L1d and turns
1199                // each probe into an L2 access. minMatch=7 (vs 6) skips
1200                // short-distance 6-byte matches: fewer sequences, less
1201                // extension/emit work, and parity with the upstream zstd's negative
1202                // ladder on both ratio and throughput.
1203                LevelParams {
1204                    strategy_tag: super::strategy::StrategyTag::Fast,
1205                    search: super::strategy::SearchMethod::Fast,
1206                    window_log: 19,
1207                    lazy_depth: 0,
1208                    fast: Some(FastConfig {
1209                        hash_log: 13,
1210                        mls: 7,
1211                        step_size,
1212                    }),
1213                    dfast: None,
1214                    hc: None,
1215                    row: None,
1216                }
1217            }
1218        }
1219    };
1220    if let Some(size) = source_size {
1221        adjust_params_for_source_size(params, size)
1222    } else {
1223        params
1224    }
1225}
1226
1227/// The cheap fingerprint pre-splitter level for a compression level (the
1228/// C-like `blockSplitterLevel`), resolved through the same per-level
1229/// `LevelParams` table as every other tuning knob. `None` keeps the whole
1230/// 128 KiB block. The frame loop reads this instead of hardcoding the
1231/// level→split mapping at the call site.
1232pub(crate) fn level_pre_split(level: CompressionLevel) -> Option<usize> {
1233    // Resolve through `resolve_level_params` directly — NOT via the legacy
1234    // `numeric_level()` alias — so named presets read the SAME table row as
1235    // every other tuning knob (`Best` maps to its own row there, which is
1236    // not the row its numeric alias points at). `Uncompressed` (raw
1237    // blocks) never splits.
1238    if matches!(level, CompressionLevel::Uncompressed) {
1239        return None;
1240    }
1241    resolve_level_params(level, None)
1242        .pre_split()
1243        .map(usize::from)
1244}
1245
/// Backend storage for [`MatchGeneratorDriver`]. Exactly one match-finder
/// state lives in the driver at a time — the active variant. Backend
/// transitions in [`Matcher::reset`] drain the current variant's allocations
/// into the shared `vec_pool` and then replace `storage` with a freshly
/// constructed variant for the new backend.
///
/// Replaces the prior pattern of four parallel fields (`match_generator`,
/// `dfast_match_generator: Option<…>`, `row_match_generator: Option<…>`,
/// `hc_match_generator: Option<…>`) + an `active_backend: BackendTag`
/// discriminator: the parallel layout kept drained inner structures
/// allocated across backend switches, and every per-frame/per-slice
/// driver operation had to dispatch on `active_backend` to pick the
/// right field. A single enum collapses the storage and makes the
/// dispatcher pattern-match on the storage variant directly — same
/// number of arms, but `storage.backend()` is now the canonical source
/// of truth and dead variants are dropped when the active backend
/// changes.
///
/// `Clone` is derived because the driver keeps a cloned copy of the storage
/// inside its primed-dictionary snapshot (`MatchGeneratorDriver::primed`).
#[derive(Clone)]
enum MatcherStorage {
    /// Upstream zstd `ZSTD_fast` family. Constructed by
    /// [`MatchGeneratorDriver::new`] as the initial variant and
    /// re-selected by [`Matcher::reset`] for any [`CompressionLevel`]
    /// that `resolve_level_params` maps to [`StrategyTag::Fast`]
    /// (`Uncompressed`, `Fastest`, `Level(1)`, and any negative
    /// `Level(n)`; `Level(0)` is the default-level alias and does not
    /// land here).
    Simple(FastKernelMatcher),
    /// Upstream zstd `ZSTD_dfast` family — two-table hash chain. Selected for
    /// any level that resolves to [`StrategyTag::Dfast`] in
    /// `resolve_level_params` (`Default`, `Level(0)`, `Level(2)`,
    /// `Level(3)`).
    Dfast(DfastMatchGenerator),
    /// Upstream zstd `ZSTD_greedy` family with row hashing. Selected for any
    /// level that resolves to [`StrategyTag::Greedy`] (currently
    /// `Level(4)` only).
    Row(RowMatchGenerator),
    /// Upstream zstd `ZSTD_lazy2` and the BT-based optimal modes
    /// (`btopt` / `btultra` / `btultra2`). Selected for any level that
    /// resolves to [`StrategyTag::Lazy`], [`StrategyTag::BtOpt`],
    /// [`StrategyTag::BtUltra`], or [`StrategyTag::BtUltra2`]
    /// (`Better`, `Best`, `Level(5..=22)`, and any `Level(n)` with
    /// `n > MAX_LEVEL` — `resolve_level_params` clamps positive
    /// numeric levels at `MAX_LEVEL = 22` via
    /// `Level(n).clamp(1, MAX_LEVEL)`, so `Level(23..=i32::MAX)` all
    /// land on `BtUltra2` here). The [`HcMatchGenerator`]'s internal
    /// [`HcBackend`] discriminator decides whether BT scratch is
    /// allocated.
    HashChain(HcMatchGenerator),
}
1294
1295impl MatcherStorage {
1296    /// Heap bytes the active backend variant holds (tables, history, scratch).
1297    fn heap_size(&self) -> usize {
1298        match self {
1299            Self::Simple(m) => m.heap_size(),
1300            Self::Dfast(m) => m.heap_size(),
1301            Self::Row(m) => m.heap_size(),
1302            Self::HashChain(m) => m.heap_size(),
1303        }
1304    }
1305
1306    /// [`super::strategy::BackendTag`] family of the active variant.
1307    fn backend(&self) -> super::strategy::BackendTag {
1308        use super::strategy::BackendTag;
1309        match self {
1310            Self::Simple(_) => BackendTag::Simple,
1311            Self::Dfast(_) => BackendTag::Dfast,
1312            Self::Row(_) => BackendTag::Row,
1313            Self::HashChain(_) => BackendTag::HashChain,
1314        }
1315    }
1316}
1317
/// This is the default implementation of the `Matcher` trait. It allocates and reuses the buffers when possible.
pub struct MatchGeneratorDriver {
    // Pool of reusable byte buffers. `recycle_simple_space` pushes the Simple
    // backend's spent per-block buffer here, and (per the `storage` note
    // below) `reset` drains a retired backend's allocations into it.
    vec_pool: Vec<Vec<u8>>,
    /// Active match-finder state. Exactly one backend lives here at a
    /// time; [`Matcher::reset`] drains the previous variant into
    /// `vec_pool` before swapping in a freshly constructed variant for
    /// the new backend. `storage.backend()` is the canonical source of
    /// truth for the parse family; `strategy_tag` carries the
    /// strategy chosen at the last `reset()`.
    storage: MatcherStorage,
    // Strategy tag resolved at `reset()` from the requested
    // `CompressionLevel`'s `LevelParams`. The driver's hot-block
    // dispatcher in `blocks/compressed.rs` matches on this tag to enter
    // the corresponding compile-time `Strategy` monomorphisation
    // (`compress_block::<S>`).
    strategy_tag: super::strategy::StrategyTag,
    // Decoupled search-method axis resolved at `reset()` from
    // `LevelParams.search`. The per-block dispatcher routes on this
    // (not on `strategy_tag`) so a level's parse and search backend can
    // be chosen independently. The `BinaryTree` arm still consults
    // `strategy_tag` to pick the opt `Strategy` ZST.
    search: super::strategy::SearchMethod,
    // Decoupled parse-mode axis resolved at `reset()` from
    // `LevelParams::parse()`. Independent of `search`: greedy / lazy /
    // lazy2 can run on any non-opt search backend. The backends still
    // read their own `lazy_depth` (kept in sync at `reset()`); this is
    // the authoritative parse selector for the dispatcher.
    parse: super::strategy::ParseMode,
    /// Test-only per-level recipe override applied in `reset()` before
    /// backend selection. Lets the parse×search matrix be exercised
    /// without editing `LEVEL_TABLE`; never compiled into production.
    #[cfg(test)]
    config_override: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    /// Fine-grained per-knob overrides from the public
    /// [`super::parameters::CompressionParameters`] surface (#27).
    /// `None` (or an all-`None` [`super::parameters::ParamOverrides`])
    /// keeps the resolved level geometry byte-identical to plain
    /// level-based compression. Applied in [`Matcher::reset`] after the
    /// level params are resolved, before backend selection. Persists
    /// across resets (it is frame configuration, not a one-shot) until
    /// the caller changes it.
    param_overrides: Option<super::parameters::ParamOverrides>,
    // Block allocation size for matcher input chunks. `new` initialises
    // both `slice_size` and `base_slice_size` from its `slice_size`
    // argument.
    // NOTE(review): `slice_size` appears to be the current (possibly
    // hint-shrunk) value and `base_slice_size` the constructor baseline
    // it is restored from — confirm in `reset`.
    slice_size: usize,
    base_slice_size: usize,
    // Frame header window size must stay at the configured live-window budget.
    // Dictionary retention expands internal matcher capacity only.
    reported_window_size: usize,
    // Tracks currently retained bytes that originated from primed dictionary
    // history and have not been evicted yet.
    dictionary_retained_budget: usize,
    // Source size hint for next frame (set via set_source_size_hint, cleared on reset).
    source_size_hint: Option<u64>,
    // Dictionary content size for the next frame (set via set_dictionary_size_hint,
    // consumed on reset). When present on a binary-tree / hash-chain backend, the
    // match-finder hash/chain tables are sized from the DICTIONARY (upstream zstd CDict
    // economics: a loaded dictionary supplies the long matches, so the live tables
    // can shrink to the dict's size tier) while the eviction window stays
    // source-sized. Mirrors upstream zstd `ZSTD_getCParamRowSize`, which picks the cParams
    // table column from `dictSize` for a dictionary-bearing compress.
    dictionary_size_hint: Option<usize>,
    // Normalized `ceil_log2` bucket of the frame's source-size hint, captured at
    // `reset` (where `source_size_hint` is consumed) via [`source_size_ceil_log`].
    // `None` means the frame was unhinted. Drives `prime_with_dictionary`'s upstream zstd
    // `ZSTD_shouldAttachDict` mode for the Simple/Fast backend: `None` (unknown)
    // or `<= FAST_ATTACH_DICT_CUTOFF_LOG` → attach (separate dict table, 2-cursor
    // `compress_block_fast_dict`); larger → copy (dictionary primed into the live
    // table, 4-cursor `compress_block_fast`). The primed-snapshot key is the
    // resolved shape ([`reset_shape`](Self::reset_shape)), not this bucket.
    reset_size_log: Option<u8>,
    // Whether the loaded dictionary fits the Fast attach path's tagged position
    // field (`<= MAX_FAST_ATTACH_DICT_REGION`). Captured at `reset` from the
    // dict-size hint (which equals the actual dict length on load) so the Fast
    // attach decision, the attach-epoch reset bit, and the primed-snapshot
    // `fast_attach` bit all gate on it consistently. `true` when there is no
    // dictionary (the attach path is then unused). A dict too large to tag falls
    // back to copy mode instead of overflowing the packed position.
    reset_dict_attach_ok: bool,
    // Hint-resolved matcher shape from the last `reset`: the [`LevelParams`], the
    // active backend's applied Dfast/Row hash-table width (`0` for HC/Fast), the
    // Fast attach-vs-copy mode, and the active LDM override (#27). Combined with
    // the frame's level into the [`PrimedKey`] that keys the primed snapshot, so
    // it is only restored into a reset that resolved the identical matcher AND
    // LDM configuration. `None` before the first `reset`.
    // Tuple layout, in order: (params, table width in bits, fast-attach flag,
    // LDM override) — the same four values `PrimedKey` stores as `params`,
    // `table_bits`, `fast_attach` and `ldm`.
    reset_shape: Option<(
        LevelParams,
        usize,
        bool,
        Option<super::parameters::LdmOverride>,
    )>,
    // One-shot borrowed block range `[start, end)` staged by the borrowed
    // Fast frame path (`set_borrowed_block`) for the NEXT
    // `start_matching` / `skip_matching_with_hint`. `Some` routes that
    // call to the Simple backend's borrowed scan instead of the owned
    // committed-block path; consumed (reset to `None`) by the routed
    // call. Always `None` on the owned streaming path.
    // NOTE(review): `set_borrowed_block` stages on every backend that
    // `borrowed_supported()` admits (Dfast / Row / HashChain included), so
    // the "Fast" / "Simple backend" wording above looks older than the
    // code — confirm against `start_matching`.
    borrowed_pending: Option<(usize, usize)>,
    /// CDict-equivalent: snapshot of the post-prime matcher state taken
    /// once after the first dictionary prime — the backend `storage`
    /// (hash tables + dictionary history + offset history + window) plus
    /// the driver-level `dictionary_retained_budget`, the only two pieces
    /// `prime_with_dictionary` writes. Subsequent frames restore this
    /// (a table memcpy) instead of re-hashing every dictionary position,
    /// mirroring upstream zstd `ZSTD_compressBegin_usingCDict` copying the
    /// precomputed `cdict->matchState`. Invalidated when the dictionary
    /// changes; keyed by the [`PrimedKey`] resolved matcher shape so a snapshot
    /// is only restored into a reset that produces the same matcher — see
    /// `restore_primed_dictionary`.
    ///
    /// Tuple layout, in order: (cloned `storage`, saved
    /// `dictionary_retained_budget`, snapshot key).
    primed: Option<(MatcherStorage, usize, PrimedKey)>,
}
1427
/// Identity of the matcher configuration a primed snapshot was captured under:
/// the FULLY RESOLVED matcher shape, not the raw source-size hint.
///
/// `reset()` resolves the hint into a [`LevelParams`] (window_log cap, the
/// HC/Fast table and search geometry, the parse depth/target-length that get
/// baked into the restored `storage`) plus, for the Dfast/Row backends, a
/// table-width derived from the hint's ceil-log bucket. The mapping from hint
/// to resolved shape is many-to-one: the source-size adjustment is monotone in
/// `ceil_log2(hint)`, and Level 22 additionally collapses several buckets onto
/// one upstream zstd tier (its `<= 16/128/256 KiB` thresholds). Keying on the raw hint
/// (or even its ceil-log bucket) therefore over-keys — two hints that resolve
/// to the identical matcher would each force a full re-prime. Keying on the
/// resolved (`params`, `table_bits`) pair restores across them.
///
/// `table_bits` is the hint-dependent hash-table width the ACTIVE backend
/// applied (`set_hash_bits` value for Dfast/Row; `0` for HC/Fast, whose widths
/// already live in `params`). The snapshot is only ever captured on the COPY
/// path (a hinted, above-cutoff frame), so `table_bits` is always the resolved
/// Dfast/Row value there, never the unhinted default.
///
/// `level` is kept alongside the resolved `params` because some stored matcher
/// state is derived from the level DIRECTLY, not through `params`: e.g. Dfast's
/// `use_fast_loop` is true for L3 but false for L4, yet L3 and L4 resolve to
/// byte-identical `params`. Without `level` a snapshot captured at L3 could be
/// restored into an L4 reset, installing the wrong `use_fast_loop`.
/// NOTE(review): the `MatcherStorage::Row` docs say `Level(4)` resolves to the
/// Greedy/Row backend, which would make the L3-vs-L4 Dfast example above
/// outdated — confirm which level pair still shares `params`.
///
/// `fast_attach` records the Fast backend's attach-vs-copy mode
/// ([`FAST_ATTACH_DICT_CUTOFF_LOG`]) because that cutoff (8 KiB) falls INSIDE a
/// single resolved shape: an 8192- and an 8193-byte Level 1 hint both clamp to
/// window_log 14 with identical `params`/`table_bits`, yet 8192 attaches (a
/// separate dict table) while 8193 copies into the live table — two different
/// `storage` shapes. The frame compressor only captures/restores snapshots on
/// the copy path today, but keying on the mode keeps the snapshot identity
/// self-sufficient rather than relying on that external gate.
///
/// Restoring a snapshot whose key differs would reinstate the old `storage`
/// (and its `max_window_size` / table dimensions / parse params / dict-table
/// shape) under a reset that resolved a different shape — the encoder could
/// then search past the frame header's window and emit an undecodable match.
/// All fields must match before a restore is allowed; the derived
/// `PartialEq`/`Eq` compare every field, so `==` on two keys is that check.
#[derive(Clone, Copy, PartialEq, Eq)]
struct PrimedKey {
    /// Compression level of the frame the snapshot was captured for; guards
    /// level-derived matcher state that `params` does not encode.
    level: super::CompressionLevel,
    /// Fully resolved level parameters the captured `storage` was built from.
    params: LevelParams,
    /// Hash-table width the active backend applied (Dfast/Row), `0` for
    /// HC/Fast.
    table_bits: usize,
    /// Fast backend dictionary mode at capture time: attach vs copy.
    fast_attach: bool,
    /// Fine-grained LDM override (#27) active at capture time. The
    /// snapshot's cloned `storage` carries `BtMatcher::ldm_producer`,
    /// which is configured from this override; restoring a snapshot
    /// captured under a different LDM configuration (enable flip or
    /// changed knobs) would reinstate a stale producer. `params` already
    /// pins `window_log` / `strategy_tag` (the rest of the producer's
    /// identity), so folding the override completes the LDM identity.
    /// `None` = LDM off, matching `ParamOverrides::ldm`.
    ldm: Option<super::parameters::LdmOverride>,
}
1484
1485impl MatchGeneratorDriver {
1486    /// `slice_size` sets the base block allocation size used for matcher input chunks.
1487    /// `max_slices_in_window` determines the initial window capacity at construction
1488    /// time. Effective window sizing is recalculated on every [`reset`](Self::reset)
1489    /// from the resolved compression level and optional source-size hint.
1490    pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
1491        // Validate inputs before deriving window_log_init. Three
1492        // failure modes need explicit guards:
1493        //
1494        // 1. Zero args → `max_window_size = 0` → silent 1-byte
1495        //    degenerate window (useless).
1496        // 2. Multiplication overflow on `slice_size *
1497        //    max_slices_in_window` → wraps silently in release.
1498        // 3. `next_power_of_two` overflow when the product is
1499        //    above `1 << (usize::BITS - 1)` → modern Rust PANICS
1500        //    on overflow (older Rust returned 0).
1501        //
1502        // Catch all three at construction with a clear domain-
1503        // specific message via `assert!` + `checked_mul` +
1504        // `checked_next_power_of_two`, rather than letting either
1505        // mode produce a silent degenerate matcher OR a generic
1506        // panic deep in `FastKernelMatcher::with_params`.
1507        assert!(
1508            slice_size > 0,
1509            "MatchGeneratorDriver::new requires slice_size > 0 (got 0)",
1510        );
1511        assert!(
1512            max_slices_in_window > 0,
1513            "MatchGeneratorDriver::new requires max_slices_in_window > 0 (got 0)",
1514        );
1515        let max_window_size = max_slices_in_window
1516            .checked_mul(slice_size)
1517            .expect("MatchGeneratorDriver::new: slice_size * max_slices_in_window overflows usize");
1518        // Derive an effective window_log for the initial-state matcher.
1519        // `MatchGeneratorDriver::new` runs BEFORE any reset, so it has
1520        // no LevelParams to consult — we initialise to whatever
1521        // window_log fits the caller's requested max_window_size
1522        // (round up to the next power of two via `next_power_of_two`'s
1523        // log). Reset() overwrites all three params from the resolved
1524        // LevelParams.
1525        //
1526        // `checked_next_power_of_two` returns `None` if the next power
1527        // of two would overflow `usize`. Modern Rust's
1528        // `next_power_of_two` PANICS on overflow rather than returning
1529        // 0 (the panic message is generic and unhelpful), so use the
1530        // checked variant to surface the failure with a clear,
1531        // domain-specific error.
1532        let next_pow2 = max_window_size.checked_next_power_of_two().expect(
1533            "MatchGeneratorDriver::new: max_window_size too large for \
1534             next_power_of_two without overflow",
1535        );
1536        let window_log_init = next_pow2.trailing_zeros() as u8;
1537        Self {
1538            vec_pool: Vec::new(),
1539            // Deferred table: `new` runs before any source size or resolved
1540            // LevelParams exist, so allocating at the level-default hash_log
1541            // here would be thrown away by the first frame's reset (which
1542            // clamps the window to the input and reallocs at the resolved
1543            // size). The deferral lets that first reset allocate exactly once.
1544            storage: MatcherStorage::Simple(FastKernelMatcher::with_params_deferred(
1545                window_log_init,
1546                FAST_LEVEL_1_HASH_LOG,
1547                FAST_LEVEL_1_MLS,
1548                2, // upstream zstd default step_size (targetLength=0 → step=2)
1549            )),
1550            strategy_tag: super::strategy::StrategyTag::Fast,
1551            search: super::strategy::SearchMethod::Fast,
1552            parse: super::strategy::ParseMode::Greedy,
1553            #[cfg(test)]
1554            config_override: None,
1555            param_overrides: None,
1556            slice_size,
1557            base_slice_size: slice_size,
1558            // Report the ROUNDED-UP window size that the matcher
1559            // actually carries (via `window_log_init = log2(next_pow2)`
1560            // → matcher's `max_window_size = 1 << window_log_init =
1561            // next_pow2`). For non-power-of-two `slice_size *
1562            // max_slices_in_window` inputs, the unrounded value
1563            // would under-report the active backend's window until
1564            // the first `reset()` overwrites both sides from the
1565            // resolved LevelParams.
1566            reported_window_size: next_pow2,
1567            reset_size_log: None,
1568            reset_dict_attach_ok: true,
1569            reset_shape: None,
1570            dictionary_retained_budget: 0,
1571            source_size_hint: None,
1572            dictionary_size_hint: None,
1573            borrowed_pending: None,
1574            primed: None,
1575        }
1576    }
1577
    /// Resolves the [`LevelParams`] for `level`, taking the optional
    /// `source_size` hint into account. Thin associated-function wrapper:
    /// both arguments are forwarded unchanged to `resolve_level_params`.
    fn level_params(level: CompressionLevel, source_size: Option<u64>) -> LevelParams {
        resolve_level_params(level, source_size)
    }
1581
    /// Install the public-parameter per-knob overrides (#27) applied at
    /// the next [`Matcher::reset`]. `None` (or an all-`None` set) restores
    /// plain level-based geometry. Persists across resets until changed.
    ///
    /// This only stores the value; nothing is re-resolved here, so the
    /// active backend is unaffected until the next reset.
    pub(crate) fn set_param_overrides(
        &mut self,
        overrides: Option<super::parameters::ParamOverrides>,
    ) {
        self.param_overrides = overrides;
    }
1591
    /// Active backend family derived from the storage variant. Single
    /// source of truth — no separate runtime tag to drift against.
    ///
    /// Returns the [`super::strategy::BackendTag`] that `storage.backend()`
    /// reports for the variant currently held.
    pub(crate) fn active_backend(&self) -> super::strategy::BackendTag {
        self.storage.backend()
    }
1597
1598    /// Whether the borrowed (no-copy, in-place over-window) scan is
1599    /// implemented for the current backend + search configuration. The
1600    /// HashChain backend serves both the lazy CHAIN parser
1601    /// (`SearchMethod::HashChain`) and the BT/optimal parsers
1602    /// (`SearchMethod::BinaryTree`); only the lazy chain has a borrowed scan
1603    /// so far, so BT/optimal stay on the owned path.
1604    pub(crate) fn borrowed_supported(&self) -> bool {
1605        use super::strategy::{BackendTag, SearchMethod, StrategyTag};
1606        match self.active_backend() {
1607            BackendTag::Simple | BackendTag::Dfast | BackendTag::Row => true,
1608            // The HashChain backend covers two searches: the lazy CHAIN parser
1609            // (borrowed-capable) and the BINARY-TREE search (btlazy2 L13-15 +
1610            // optimal BtOpt/BtUltra/BtUltra2 L16-22). btlazy2's BT-tree borrowed
1611            // scan is byte-identical to owned (reads via live_history()), so it
1612            // takes the in-place path. The OPTIMAL parsers stay owned: their
1613            // cost-based DP is sensitive to candidate quality, and the borrowed
1614            // continuous-index scan yields slightly different (ratio-worse)
1615            // candidates than the owned evict+rehash scan — borrowed optimal
1616            // both diverged from owned and fell outside the ffi ratio bound.
1617            // Search-aware (not just strategy_tag) so optimal BT can never be
1618            // staged on the borrowed path even via an internal caller.
1619            BackendTag::HashChain => match self.search {
1620                SearchMethod::HashChain => true,
1621                SearchMethod::BinaryTree => matches!(self.strategy_tag, StrategyTag::Btlazy2),
1622                _ => false,
1623            },
1624        }
1625    }
1626
1627    /// Whether a DICTIONARY frame can take the borrowed (no input copy) path.
1628    /// Only the Simple (Fast) backend with the dictionary ATTACHED (not the
1629    /// copy/merge regime) has a borrowed dict scan — `start_matching_borrowed_dict`
1630    /// reads live matches from the borrowed input in place and dict matches
1631    /// from the committed dict prefix via the 2-segment counter. Every other
1632    /// backend, and copy-mode (large-input) dict frames, stay on the owned
1633    /// path. Checked AFTER priming, so `is_attached()` reflects the resolved
1634    /// attach-vs-copy decision.
1635    pub(crate) fn borrowed_dict_supported(&self) -> bool {
1636        matches!(
1637            &self.storage,
1638            MatcherStorage::Simple(m) if m.dict_is_attached()
1639        )
1640    }
1641
1642    fn simple_mut(&mut self) -> &mut FastKernelMatcher {
1643        match &mut self.storage {
1644            MatcherStorage::Simple(m) => m,
1645            _ => panic!("simple backend must be initialized by reset() before use"),
1646        }
1647    }
1648
1649    /// Reclaim the per-block input buffer that the Simple backend
1650    /// just spent inside `start_matching` / `skip_matching_with_hint`.
1651    ///
1652    /// `FastKernelMatcher::take_recycled_space` returns the cleared
1653    /// (capacity-retained) `Vec<u8>` from the last
1654    /// `extend_history_with_pending`. We push it onto `vec_pool`
1655    /// as-is (with `len = 0`); `get_next_space()` is responsible for
1656    /// resizing the buffer back to `slice_size` on its next pop. The
1657    /// pushed length is irrelevant — only the capacity matters, and
1658    /// `extend_history_with_pending` preserves it. Without this
1659    /// recycle path, the Simple backend would allocate a new
1660    /// `Vec<u8>` per block — a measurable hot-path cost when blocks
1661    /// are small (~128 KiB) and processed at hundreds of MiB/s.
1662    fn recycle_simple_space(&mut self) {
1663        if let Some(space) = self.simple_mut().take_recycled_space() {
1664            // `space` is already cleared (len = 0) by
1665            // `extend_history_with_pending`; capacity is retained.
1666            // Leaving `len = 0` here avoids the cost of zero-filling
1667            // the entire allocation — `get_next_space()` resizes the
1668            // popped buffer up to `slice_size` on demand, so the
1669            // length the pool holds is irrelevant. This matters most
1670            // after a small-source-size hint has shrunk `slice_size`
1671            // mid-frame: the recycled buffer can be much larger than
1672            // the current `slice_size`, and zero-filling 128 KiB+ on
1673            // every block would erase the perf win the recycle path
1674            // is meant to deliver.
1675            self.vec_pool.push(space);
1676        }
1677    }
1678
1679    /// Register a caller-owned input buffer as the Simple backend's
1680    /// borrowed one-shot match window. Only valid on the Simple (Fast)
1681    /// backend; the one-shot frame path gates on that before calling.
1682    ///
1683    /// # Safety
1684    /// Same contract as [`FastKernelMatcher::set_borrowed_window`]: the
1685    /// buffer must stay live and unmodified until the window is cleared,
1686    /// and must be cleared before the buffer is dropped or the matcher is
1687    /// reused for another frame.
1688    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
1689        // SAFETY: forwarded contract — caller upholds liveness/clear.
1690        match self.active_backend() {
1691            super::strategy::BackendTag::Simple => unsafe {
1692                self.simple_mut().set_borrowed_window(buffer)
1693            },
1694            super::strategy::BackendTag::Dfast => unsafe {
1695                self.dfast_matcher_mut().set_borrowed_window(buffer)
1696            },
1697            super::strategy::BackendTag::Row => unsafe {
1698                self.row_matcher_mut().set_borrowed_window(buffer)
1699            },
1700            super::strategy::BackendTag::HashChain => unsafe {
1701                self.hc_matcher_mut().set_borrowed_window(buffer)
1702            },
1703        }
1704    }
1705
1706    /// Clear the borrowed one-shot window, returning the active backend
1707    /// to the owned `history` path.
1708    pub(crate) fn clear_borrowed_window(&mut self) {
1709        match self.active_backend() {
1710            super::strategy::BackendTag::Simple => self.simple_mut().clear_borrowed_window(),
1711            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().clear_borrowed_window(),
1712            super::strategy::BackendTag::Row => self.row_matcher_mut().clear_borrowed_window(),
1713            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().clear_borrowed_window(),
1714            #[allow(unreachable_patterns)]
1715            _ => {}
1716        }
1717        self.borrowed_pending = None;
1718    }
1719
1720    /// Stage the borrowed block range `[block_start, block_end)` for the
1721    /// NEXT `start_matching` / `skip_matching_with_hint`, which the
1722    /// borrowed Fast frame path uses in place of `commit_space`. While
1723    /// staged, those trait calls route to the Simple backend's borrowed
1724    /// scan/skip (consuming the stage) instead of the owned committed
1725    /// block. See [`Matcher::start_matching`] /
1726    /// [`Matcher::skip_matching_with_hint`] on this type.
1727    pub(crate) fn set_borrowed_block(&mut self, block_start: usize, block_end: usize) {
1728        assert!(
1729            self.borrowed_supported(),
1730            "borrowed block staging is not supported for the active backend/search config",
1731        );
1732        assert!(
1733            block_start <= block_end,
1734            "borrowed block range must satisfy start <= end (start={block_start} end={block_end})",
1735        );
1736        self.borrowed_pending = Some((block_start, block_end));
1737        // Make the range visible to `get_last_space()` immediately: the
1738        // emit pipeline reads `get_last_space().len()` in
1739        // `collect_block_parts` BEFORE `start_matching` consumes the
1740        // stage, so the staged block (not the whole borrowed window) must
1741        // be reported now to keep the literal-buffer reservation right.
1742        match self.active_backend() {
1743            super::strategy::BackendTag::Simple => self
1744                .simple_mut()
1745                .stage_borrowed_block(block_start, block_end),
1746            super::strategy::BackendTag::Dfast => self
1747                .dfast_matcher_mut()
1748                .stage_borrowed_block(block_start, block_end),
1749            super::strategy::BackendTag::Row => self
1750                .row_matcher_mut()
1751                .stage_borrowed_block(block_start, block_end),
1752            super::strategy::BackendTag::HashChain => self
1753                .hc_matcher_mut()
1754                .table
1755                .stage_borrowed_block(block_start, block_end),
1756        }
1757    }
1758
1759    #[cfg(test)]
1760    fn dfast_matcher(&self) -> &DfastMatchGenerator {
1761        match &self.storage {
1762            MatcherStorage::Dfast(m) => m,
1763            _ => panic!("dfast backend must be initialized by reset() before use"),
1764        }
1765    }
1766
1767    fn dfast_matcher_mut(&mut self) -> &mut DfastMatchGenerator {
1768        match &mut self.storage {
1769            MatcherStorage::Dfast(m) => m,
1770            _ => panic!("dfast backend must be initialized by reset() before use"),
1771        }
1772    }
1773
1774    #[cfg(test)]
1775    fn row_matcher(&self) -> &RowMatchGenerator {
1776        match &self.storage {
1777            MatcherStorage::Row(m) => m,
1778            _ => panic!("row backend must be initialized by reset() before use"),
1779        }
1780    }
1781
1782    fn row_matcher_mut(&mut self) -> &mut RowMatchGenerator {
1783        match &mut self.storage {
1784            MatcherStorage::Row(m) => m,
1785            _ => panic!("row backend must be initialized by reset() before use"),
1786        }
1787    }
1788
1789    #[cfg(test)]
1790    fn hc_matcher(&self) -> &HcMatchGenerator {
1791        match &self.storage {
1792            MatcherStorage::HashChain(m) => m,
1793            _ => panic!("hash chain backend must be initialized by reset() before use"),
1794        }
1795    }
1796
1797    fn hc_matcher_mut(&mut self) -> &mut HcMatchGenerator {
1798        match &mut self.storage {
1799            MatcherStorage::HashChain(m) => m,
1800            _ => panic!("hash chain backend must be initialized by reset() before use"),
1801        }
1802    }
1803
1804    /// Shrink the active backend's `max_window_size` by the bytes
1805    /// reclaimed from the dictionary-retention budget. Returns `true`
1806    /// iff any reclamation happened — the caller uses that as the
1807    /// gate for [`Self::trim_after_budget_retire`] (which is a no-op
1808    /// otherwise: with `max_window_size` unchanged the backend's
1809    /// `trim_to_window` cannot find anything to evict, so calling it
1810    /// just runs an extra `match` ladder + a single early-out check
1811    /// per slice commit).
1812    #[must_use]
1813    fn retire_dictionary_budget(&mut self, evicted_bytes: usize) -> bool {
1814        let reclaimed = evicted_bytes.min(self.dictionary_retained_budget);
1815        if reclaimed == 0 {
1816            return false;
1817        }
1818        self.dictionary_retained_budget -= reclaimed;
1819        match self.active_backend() {
1820            super::strategy::BackendTag::Simple => {
1821                let matcher = self.simple_mut();
1822                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1823                // retained dict budget is tracked independently and the
1824                // window may already have been shrunk by a prior eviction,
1825                // so the floor at 0 is the correct clamp, not a masked bug.
1826                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1827            }
1828            super::strategy::BackendTag::Dfast => {
1829                let matcher = self.dfast_matcher_mut();
1830                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1831                // retained dict budget is tracked independently and the
1832                // window may already have been shrunk by a prior eviction,
1833                // so the floor at 0 is the correct clamp, not a masked bug.
1834                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1835            }
1836            super::strategy::BackendTag::Row => {
1837                let matcher = self.row_matcher_mut();
1838                // `reclaimed` can exceed the CURRENT `max_window_size`: the
1839                // retained dict budget is tracked independently and the
1840                // window may already have been shrunk by a prior eviction,
1841                // so the floor at 0 is the correct clamp, not a masked bug.
1842                matcher.max_window_size = matcher.max_window_size.saturating_sub(reclaimed);
1843            }
1844            super::strategy::BackendTag::HashChain => {
1845                let matcher = self.hc_matcher_mut();
1846                // See the Simple arm: `reclaimed` may exceed the current
1847                // window, so saturating to 0 is the correct clamp.
1848                matcher.table.max_window_size =
1849                    matcher.table.max_window_size.saturating_sub(reclaimed);
1850            }
1851        }
1852        true
1853    }
1854
1855    fn trim_after_budget_retire(&mut self) {
1856        loop {
1857            let mut evicted_bytes = 0usize;
1858            match self.active_backend() {
1859                super::strategy::BackendTag::Simple => {
1860                    // FastKernelMatcher owns its history as a single
1861                    // flat `Vec<u8>` (upstream zstd's flat-buffer layout)
1862                    // rather than the legacy per-block `WindowEntry`
1863                    // stack. There are no per-block Vec allocations
1864                    // to recycle into `vec_pool` — `trim_to_window`
1865                    // drains the oldest bytes in-place and returns
1866                    // the count for the dictionary-budget loop's
1867                    // termination check.
1868                    let MatcherStorage::Simple(m) = &mut self.storage else {
1869                        unreachable!("active_backend() == Simple proven above");
1870                    };
1871                    evicted_bytes += m.trim_to_window();
1872                }
1873                super::strategy::BackendTag::Dfast => {
1874                    // Dfast doesn't retain input Vecs — `history` is the
1875                    // only byte store, so there is no per-block buffer
1876                    // to push back through a callback. Eviction byte
1877                    // count is derived from the `window_size` delta
1878                    // before/after; the Dfast variant of
1879                    // `trim_to_window` takes no closure, sidestepping
1880                    // an unused-`impl FnMut` monomorphization that
1881                    // would otherwise contractually never fire.
1882                    let dfast = self.dfast_matcher_mut();
1883                    let pre = dfast.window_size;
1884                    dfast.trim_to_window();
1885                    evicted_bytes += pre - dfast.window_size;
1886                }
1887                super::strategy::BackendTag::Row => {
1888                    // Row keeps bytes only in the contiguous `history` mirror
1889                    // (block buffers are returned to the pool per block in
1890                    // `add_data`), so derive the eviction count from the
1891                    // `window_size` delta, mirroring the Dfast / HashChain arms.
1892                    let row = self.row_matcher_mut();
1893                    let pre = row.window_size;
1894                    row.trim_to_window();
1895                    evicted_bytes += pre - row.window_size;
1896                }
1897                super::strategy::BackendTag::HashChain => {
1898                    // HC keeps bytes only in the contiguous `history` mirror
1899                    // (no per-block Vecs to recycle since the window<->history
1900                    // dedup), so derive the eviction count from the
1901                    // `window_size` delta, mirroring the Dfast arm above.
1902                    let table = &mut self.hc_matcher_mut().table;
1903                    let pre = table.window_size;
1904                    table.trim_to_window();
1905                    evicted_bytes += pre - table.window_size;
1906                }
1907            }
1908            if evicted_bytes == 0 {
1909                break;
1910            }
1911            // The loop's invariant is "the backend's previous
1912            // `max_window_size` shrink had downstream bytes left to
1913            // evict" — that's what `evicted_bytes != 0` proves at
1914            // this point. `dictionary_retained_budget` is NOT
1915            // guaranteed to be positive here: the outer
1916            // `retire_dictionary_budget` call may have already
1917            // drained it to zero by reclaiming the last retained
1918            // bytes, while the backend still has bytes above the
1919            // freshly-shrunk window cap waiting for this loop to
1920            // evict. The return value of the retire call below is
1921            // therefore intentionally discarded — the loop's
1922            // termination is driven by `evicted_bytes == 0`, not by
1923            // whether the budget has more bytes left to reclaim.
1924            let _ = self.retire_dictionary_budget(evicted_bytes);
1925        }
1926    }
1927
1928    /// ATTACH (`true`) vs COPY (`false`) decision for the dms-bearing HashChain
1929    /// backend (lazy hash-chain AND binary-tree/optimal levels), mirroring
1930    /// upstream `ZSTD_shouldAttachDict` and its per-strategy `attachDictSizeCutoffs`:
1931    /// a small / unknown source ATTACHES the dict as a separate dms (hash-chain
1932    /// dms for lazy, DUBT dms for BT); a large known source COPIES it into the
1933    /// live chain / tree. The cutoff is the lazy/lazy2 value for HC, the
1934    /// btlazy2/btopt value for Bt{Opt}, and the smaller btultra/btultra2 value for
1935    /// the deepest parses. Both `skip_matching_for_dictionary_priming` (which
1936    /// stages the dict) and `prime_with_dictionary` (which builds-or-drops the
1937    /// dms) read this so the two stay in lock-step.
1938    fn hc_dict_attach_mode(&self) -> bool {
1939        // Only the HashChain backend (lazy hash-chain + BT/optimal) routes here;
1940        // a non-HashChain storage has no dms decision, so default to attach.
1941        let MatcherStorage::HashChain(hc) = &self.storage else {
1942            return true;
1943        };
1944        let cutoff = if hc.table.uses_bt {
1945            match hc.strategy_tag {
1946                super::strategy::StrategyTag::BtUltra | super::strategy::StrategyTag::BtUltra2 => {
1947                    BT_ULTRA_ATTACH_DICT_CUTOFF_LOG
1948                }
1949                _ => BT_OPT_ATTACH_DICT_CUTOFF_LOG,
1950            }
1951        } else {
1952            HC_ATTACH_DICT_CUTOFF_LOG
1953        };
1954        self.reset_size_log.is_none_or(|log| log <= cutoff)
1955    }
1956
1957    fn skip_matching_for_dictionary_priming(&mut self) {
1958        match self.active_backend() {
1959            super::strategy::BackendTag::Simple => {
1960                // Upstream zstd `ZSTD_shouldAttachDict` mode selection for the Fast
1961                // strategy (cutoff 8 KB): small / unknown-size inputs ATTACH
1962                // (index dict positions into a SEPARATE immutable table; the
1963                // dual-probe 2-cursor `compress_block_fast_dict` then prefers
1964                // recent-input matches and falls back to the dict — the path
1965                // that wins small/unknown). Large known-size inputs COPY (prime
1966                // dict into the live table; the 4-cursor `compress_block_fast`
1967                // matches against it as window history — the path that already
1968                // matches/beats the upstream zstd on large corpora). The dispatch in
1969                // `start_matching` keys off `dict_table.is_some()`, which only
1970                // the attach path populates. See [`FAST_ATTACH_DICT_CUTOFF_LOG`].
1971                let attach = self.reset_dict_attach_ok
1972                    && self
1973                        .reset_size_log
1974                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
1975                if attach {
1976                    self.simple_mut().skip_matching_for_dict_prime();
1977                } else {
1978                    self.simple_mut().skip_matching_with_hint(Some(false));
1979                }
1980                self.recycle_simple_space();
1981            }
1982            super::strategy::BackendTag::Dfast => {
1983                // Upstream zstd `ZSTD_dictMatchState` mode selection for dfast (cutoff
1984                // 16 KiB): small / unknown-size inputs ATTACH (build the
1985                // separate immutable dict long+short tables; the dual-probe
1986                // `start_matching_fast_loop` searches live + dict, the path that
1987                // avoids the per-frame dict re-prime that dominates small
1988                // `compress-dict`). Larger known-size inputs COPY (re-prime the
1989                // dict into the live tables via `skip_matching_dense`, where the
1990                // dense scan matches it as window history). `skip_matching_for_dict_attach`
1991                // self-gates on `use_fast_loop` (only fast-loop levels carry the
1992                // dual-probe; general-path levels fall back to the dense copy).
1993                let attach = self
1994                    .reset_size_log
1995                    .is_none_or(|log| log <= DFAST_ATTACH_DICT_CUTOFF_LOG);
1996                if attach {
1997                    self.dfast_matcher_mut().skip_matching_for_dict_attach();
1998                } else {
1999                    self.dfast_matcher_mut().invalidate_dict_cache();
2000                    self.dfast_matcher_mut().skip_matching_dense();
2001                }
2002            }
2003            super::strategy::BackendTag::Row => {
2004                // Upstream zstd `ZSTD_RowFindBestMatch` `dictMatchState`: small /
2005                // unknown-size inputs ATTACH (build the separate immutable dict
2006                // row index; the bounded dual-probe in `row_candidate_rl`
2007                // searches live + dict, avoiding the per-frame dict re-index),
2008                // larger known-size inputs COPY (dense re-prime into the live
2009                // rows).
2010                let attach = self
2011                    .reset_size_log
2012                    .is_none_or(|log| log <= ROW_ATTACH_DICT_CUTOFF_LOG);
2013                if attach {
2014                    self.row_matcher_mut().prime_dict_attach_current_block();
2015                } else {
2016                    self.row_matcher_mut().invalidate_dict_cache();
2017                    self.row_matcher_mut().skip_matching_with_hint(Some(false));
2018                }
2019            }
2020            super::strategy::BackendTag::HashChain => {
2021                // Lazy-HC AND BT/optimal both follow upstream zstd `ZSTD_shouldAttachDict`
2022                // per-strategy: ATTACH (a separate dms — hash-chain dms for lazy,
2023                // DUBT dms for BT) for small / unknown inputs, COPY (merge the dict
2024                // into the live chain/tree) for large known inputs. ATTACH keeps
2025                // the dict in history but out of the live structure via
2026                // `skip_matching_dict_bt` (the cursor advance is shared by both
2027                // arms); COPY routes through the normal `skip_matching` (its
2028                // `uses_bt` branch fills the live tree, the lazy branch the live
2029                // chain). The dms is built-or-dropped to match in
2030                // `prime_with_dictionary`.
2031                if self.hc_dict_attach_mode() {
2032                    self.hc_matcher_mut().table.skip_matching_dict_bt();
2033                } else {
2034                    self.hc_matcher_mut().skip_matching(Some(false));
2035                }
2036            }
2037        }
2038    }
2039}
2040
2041impl Matcher for MatchGeneratorDriver {
    /// Reports whether this matcher supports dictionary priming; this driver
    /// unconditionally answers `true`.
    fn supports_dictionary_priming(&self) -> bool {
        true
    }
2045
    /// Records `size` as the pending source-size hint, overwriting any hint
    /// stored earlier. The hint is one-shot: `reset` consumes it with
    /// `take()`, so it has to be set again before each reset that should
    /// see it.
    fn set_source_size_hint(&mut self, size: u64) {
        self.source_size_hint = Some(size);
    }
2049
    /// Records `size` as the pending dictionary-size hint, overwriting any
    /// hint stored earlier. The hint is one-shot: `reset` consumes it with
    /// `take()` and uses it for its dictionary-driven decisions.
    fn set_dictionary_size_hint(&mut self, size: usize) {
        self.dictionary_size_hint = Some(size);
    }
2053
2054    /// Dict-relevance gate for the raw-fast-path. Reached only when a dictionary
2055    /// is active (the caller short-circuits on `dict_active`), so this answers
2056    /// "could the dict compress this otherwise-incompressible-looking block?".
2057    /// The Simple (Fast) backend samples its dict table precisely
2058    /// ([`FastKernelMatcher::block_samples_match_dict`]); the other backends
2059    /// (Dfast / Row / HashChain / BT) have their own dict structures and no cheap
2060    /// probe here, so they answer CONSERVATIVELY `true`: without a probe they
2061    /// cannot tell whether the dict compresses an incompressible-LOOKING block,
2062    /// and answering `false` would let the raw-fast-path emit such a block raw
2063    /// and miss an embedded dict segment. `dictionary_segment_in_incompressible_input_is_matched`
2064    /// pins this for Dfast/Row/BT — the 512-byte dict run inside high-entropy
2065    /// filler is matched only because these backends stay on the scan. So they
2066    /// keep the blanket scan the old `!dict_active` gate gave them; only the
2067    /// Simple/Fast backend trades it for the precise probe.
2068    fn block_samples_match_dict(&self, block: &[u8]) -> bool {
2069        match &self.storage {
2070            MatcherStorage::Simple(m) => m.block_samples_match_dict(block),
2071            _ => true,
2072        }
2073    }
2074
2075    /// Heap bytes this driver owns: the active backend's tables/history, the
2076    /// recycled input-buffer pool, and the primed-dictionary snapshot (a cloned
2077    /// backend kept for CDict-equivalent reuse). The inline struct itself is
2078    /// accounted by the owner's `size_of`.
2079    fn heap_size(&self) -> usize {
2080        let pool: usize = self.vec_pool.capacity() * core::mem::size_of::<Vec<u8>>()
2081            + self.vec_pool.iter().map(Vec::capacity).sum::<usize>();
2082        let snapshot = self
2083            .primed
2084            .as_ref()
2085            .map_or(0, |(storage, _, _)| storage.heap_size());
2086        pool + self.storage.heap_size() + snapshot
2087    }
2088
    /// Discards any stored public-parameter overrides by resetting
    /// `param_overrides` to `None`, so a later `reset` finds none to apply.
    fn clear_param_overrides(&mut self) {
        self.param_overrides = None;
    }
2092
2093    fn reset(&mut self, level: CompressionLevel) {
2094        let hint = self.source_size_hint.take();
2095        let dict_hint = self.dictionary_size_hint.take();
2096        // Snapshot the hint's normalized ceil-log bucket for the primed-snapshot
2097        // key and prime_with_dictionary's attach/copy mode decision (the hint is
2098        // consumed here, but priming happens just after reset). Storing the
2099        // bucket rather than the raw bytes means two hints that resolve to the
2100        // same matcher shape share one snapshot instead of each re-priming.
2101        self.reset_size_log = hint.map(source_size_ceil_log);
2102        // A dictionary too large for the tagged attach position field falls back
2103        // to copy mode. Captured here (from the load-set size hint = actual dict
2104        // length) so the prime decision and the snapshot-key / epoch bits agree.
2105        self.reset_dict_attach_ok =
2106            dict_hint.is_none_or(|size| size <= MAX_FAST_ATTACH_DICT_REGION);
2107        let hinted = hint.is_some();
2108        #[cfg_attr(not(test), allow(unused_mut))]
2109        let mut params = Self::level_params(level, hint);
2110        // Test-only: apply a parse×search override so the matrix can be
2111        // exercised without editing `LEVEL_TABLE`. Mutating `params` here
2112        // (before `next_backend`) flows the override through storage
2113        // selection, `configure`, and the `self.search`/`self.parse`
2114        // writes uniformly. Consumed with `take()` so it is one-shot: the
2115        // synthetic pairing applies to exactly this `reset()`, and a later
2116        // reset on the same driver falls back to the level's real config.
2117        #[cfg(test)]
2118        if let Some((search, parse)) = self.config_override.take() {
2119            params.search = search;
2120            params.lazy_depth = parse.lazy_depth();
2121            // The matrix sweep can pair a level with a backend its native
2122            // row doesn't populate (e.g. greedy L5, which carries only `row`,
2123            // run on HashChain). Synthesize a default config for the
2124            // overridden backend so its `configure` arm has something to read.
2125            use super::strategy::SearchMethod;
2126            match search {
2127                SearchMethod::Fast => {
2128                    params.fast.get_or_insert(FAST_L1);
2129                }
2130                SearchMethod::DoubleFast => {
2131                    params.dfast.get_or_insert(DFAST_L3);
2132                }
2133                SearchMethod::RowHash => {
2134                    params.row.get_or_insert(ROW_CONFIG);
2135                }
2136                SearchMethod::HashChain | SearchMethod::BinaryTree => {
2137                    params.hc.get_or_insert(HC_CONFIG);
2138                }
2139            }
2140        }
2141        // Public-parameter overrides (#27): apply the per-knob set on top
2142        // of the level-resolved params. A strategy override re-routes the
2143        // backend, so this must precede `next_backend` selection. The
2144        // all-`None` case is skipped so default level geometry stays
2145        // byte-identical to plain level-based compression.
2146        if let Some(ov) = self.param_overrides
2147            && !ov.is_empty()
2148        {
2149            apply_param_overrides(&mut params, &ov);
2150            // `Self::level_params(level, hint)` applied the source-size cap
2151            // for the LEVEL's native backend. If a strategy override moved
2152            // the frame onto a different backend, `apply_param_overrides`
2153            // synthesized that backend's DEFAULT config (FAST_L1 /
2154            // HC_OVERRIDE_DEFAULT) with full-size table logs AFTER that cap
2155            // ran. Re-apply the hint cap so a tiny hinted frame doesn't
2156            // allocate the new backend's full-size tables. An explicit
2157            // `window_log` override is the user's hard request and must
2158            // survive the re-cap, so restore it afterwards.
2159            if let Some(hint_size) = hint {
2160                params = adjust_params_for_source_size(params, hint_size);
2161                if let Some(window_log) = ov.window_log {
2162                    params.window_log = window_log;
2163                }
2164            }
2165        }
2166        // Dictionary-driven table sizing — parity with upstream zstd `ZSTD_createCDict`
2167        // (`ZSTD_getCParams_internal(level, UNKNOWN, dictSize, ZSTD_cpm_createCDict)`
2168        // → `ZSTD_adjustCParams_internal`). A loaded dictionary supplies the
2169        // long-distance matches, so upstream zstd sizes the prepared match-finder tables
2170        // to the DICTIONARY (assuming a `minSrcSize` source), not the live
2171        // window: it downsizes `hashLog`/`chainLog` toward the dict-and-window
2172        // log while leaving the frame's eviction `window_log` source-derived so
2173        // the dictionary bytes stay referenceable (`ZSTD_resetCCtx_byCopyingCDict`
2174        // copies the small CDict tables but keeps the source window). We apply
2175        // the same downsizing to the level's own hc geometry and cap (min) so a
2176        // dict never inflates the level tables. Only the binary-tree / hash-chain
2177        // backend reads `hc.{hash,chain}_log`; Simple/Dfast/Row derive their
2178        // widths from the source window in their `reset` arms.
2179        // A zero-length dictionary is "no dictionary": running the CDict sizing
2180        // path for `Some(0)` is not a no-op — `cdict_table_logs(.., 0)` still
2181        // collapses the HC/BT tables toward the 513-byte upstream zstd tier via
2182        // `DICT_MIN_SRC_SIZE`, tanking ratio/perf on the next frame. Priming
2183        // already treats empty content as empty, so skip the downsizing here too.
2184        if let Some(dict_size) = dict_hint.filter(|&size| size > 0) {
2185            // Derive the dict-tier geometry from the level's FULL (un-source-capped)
2186            // hc widths. `Self::level_params(level, hint)` already source-capped
2187            // `params.hc`; feeding those capped widths into `cdict_table_logs` and
2188            // then `.min()`-ing would double-cap, so on a small hinted source with a
2189            // large dictionary the prepared tables collapse below what the dict needs
2190            // — defeating the `ZSTD_createCDict` geometry this mirrors. Take the
2191            // un-hinted base widths instead and assign the result directly:
2192            // `cdict_table_logs` only ever downsizes, so it never exceeds the base
2193            // level geometry, while the eviction `window_log` stays source-derived so
2194            // the dictionary bytes remain referenceable. Active public-parameter
2195            // overrides (#27) are applied to the base too, so a strategy override
2196            // that routes onto HashChain/BinaryTree still gets dict-tier sizing and
2197            // explicit hash/chain overrides feed through as the geometry ceiling.
2198            let mut base_params = Self::level_params(level, None);
2199            if let Some(ov) = self.param_overrides
2200                && !ov.is_empty()
2201            {
2202                apply_param_overrides(&mut base_params, &ov);
2203            }
2204            if let (Some(hc), Some(base_hc)) = (params.hc.as_mut(), base_params.hc) {
2205                let uses_bt = matches!(
2206                    params.strategy_tag,
2207                    super::strategy::StrategyTag::Btlazy2
2208                        | super::strategy::StrategyTag::BtOpt
2209                        | super::strategy::StrategyTag::BtUltra
2210                        | super::strategy::StrategyTag::BtUltra2
2211                );
2212                let (dict_hash_log, dict_chain_log) = cdict_table_logs(
2213                    params.window_log,
2214                    base_hc.hash_log,
2215                    base_hc.chain_log,
2216                    uses_bt,
2217                    dict_size,
2218                );
2219                hc.hash_log = dict_hash_log;
2220                hc.chain_log = dict_chain_log;
2221            }
2222        }
2223        // upstream zstd `ZSTD_resolveRowMatchFinderMode` (zstd_compress.c:238-245):
2224        // the row matchfinder is used for greedy/lazy/lazy2 ONLY when
2225        // `windowLog > 14`; at or below that upstream runs the hash-chain
2226        // matcher (`ZSTD_HcFindBestMatch`). We previously hardcoded the Row
2227        // backend for these strategies regardless of window, sending every
2228        // small-window frame (hinted floor = windowLog 14, e.g. the small-4k/10k
2229        // fixtures) through Row where upstream uses HC. Match it: fall back to
2230        // the hash-chain matcher (lazy/greedy parse via `lazy_depth`) when the
2231        // resolved window is <= 14. The HC config is synthesised from the
2232        // level's RowConfig (HC and Row share the same cParams; only the
2233        // matchfinder differs) — `hash_log` / `chain_log` are
2234        // clamped to the (<= 14) window inside the HashChain reset arm, so the
2235        // nominal width here only sets the clamp ceiling.
2236        if params.search == super::strategy::SearchMethod::RowHash && params.window_log <= 14 {
2237            let row = params
2238                .row
2239                .expect("a RowHash level row must carry a RowConfig");
2240            params.search = super::strategy::SearchMethod::HashChain;
2241            // For a dict-bearing frame, downsize the synthesised HC logs to the
2242            // dictionary's content tier via `cdict_table_logs` (the same
2243            // correction the native HC dict-prime path applies above), so a dict
2244            // much smaller than the window doesn't prime a needlessly sparse
2245            // table. Row-finder levels are never BinaryTree, so `uses_bt = false`.
2246            //
2247            // Feed `cdict_table_logs` the UN-hinted base Row width, not the
2248            // resolved `row.hash_bits`: the latter is already source-capped on a
2249            // hinted reset (the `row_cap = table_log + 1` clamp), so passing it
2250            // here would double-cap exactly as the native HC dict path warns
2251            // above — a small hinted source with a large dictionary would
2252            // collapse the prepared table below what the dict needs.
2253            // `cdict_table_logs` only ever downsizes, so deriving the ceiling
2254            // from the un-hinted base (plus active public overrides) keeps the
2255            // dict-tier geometry intact. No source hint => `row.hash_bits` is
2256            // already the level's full width, so reuse it directly.
2257            let row_cdict_hash_bits = match dict_hint.filter(|&size| size > 0) {
2258                Some(_) => {
2259                    let mut base_params = Self::level_params(level, None);
2260                    if let Some(ov) = self.param_overrides
2261                        && !ov.is_empty()
2262                    {
2263                        apply_param_overrides(&mut base_params, &ov);
2264                    }
2265                    base_params
2266                        .row
2267                        .map_or(row.hash_bits, |base_row| base_row.hash_bits)
2268                }
2269                None => row.hash_bits,
2270            };
2271            // Row-backed levels carry only `hash_bits`; the HC chain table they
2272            // fall back to follows the upstream zstd cParams relationship `chainLog =
2273            // hashLog - 1` for every Row level (L6 c18 h19 .. L12 c22 h23, see
2274            // the ROW_L* tables). Synthesise the chain width as `hash_bits - 1`
2275            // so the dict path doesn't leave the chain table one bit too wide
2276            // (cdict_table_logs only downsizes, so passing the full hash width
2277            // for both would keep a 2x-too-large chain table on dict frames).
2278            // Raw `- 1` is underflow-safe: `hash_bits` is either a predefined
2279            // ROW_L* width (>= 19) or a public `hash_log` override, and the
2280            // override is range-validated to `ZSTD_HASHLOG_MIN = 6` at the
2281            // parameter API, so the value is always >= 6 here.
2282            //
2283            // A public `chain_log` override (#27) is dropped by the RowHash
2284            // override arm (Row has no chain table), but once this frame falls
2285            // back to HC the chain table is live and must honour it — mirror
2286            // the native HC dict path, which feeds the override-applied
2287            // `base_hc.chain_log` into `cdict_table_logs`. Use the explicit
2288            // override (also API-validated to ZSTD_CHAINLOG_MIN = 6) when set,
2289            // else the upstream zstd `hashLog - 1` relationship.
2290            let explicit_chain_log = self
2291                .param_overrides
2292                .filter(|ov| !ov.is_empty())
2293                .and_then(|ov| ov.chain_log)
2294                .map(|chain_log| chain_log as usize);
2295            let row_cdict_chain_bits = explicit_chain_log.unwrap_or(row_cdict_hash_bits - 1);
2296            let (mut hash_log, mut chain_log) = match dict_hint.filter(|&size| size > 0) {
2297                Some(dict_size) => cdict_table_logs(
2298                    params.window_log,
2299                    row_cdict_hash_bits,
2300                    row_cdict_chain_bits,
2301                    false,
2302                    dict_size,
2303                ),
2304                None => (
2305                    row.hash_bits,
2306                    explicit_chain_log.unwrap_or(row.hash_bits - 1),
2307                ),
2308            };
2309            // No-dict path: the HashChain reset arm only clamps the logs to the
2310            // window when `hinted`, but a public `window_log` override can lower
2311            // this level to <= 14 with no source hint — clamp the level's full
2312            // Row `hash_bits` to the window here too (upstream zstd `ZSTD_adjustCParams`:
2313            // hashLog <= windowLog + 1, chainLog <= windowLog) so a 16 KiB window
2314            // doesn't allocate Row-sized HC tables.
2315            if dict_hint.filter(|&size| size > 0).is_none() {
2316                let wlog = params.window_log as usize;
2317                hash_log = hash_log.min(wlog + 1);
2318                chain_log = chain_log.min(wlog);
2319            }
2320            params.hc = Some(HcConfig {
2321                hash_log,
2322                chain_log,
2323                search_depth: row.search_depth,
2324                target_len: row.target_len,
2325                search_mls: 4,
2326            });
2327            params.row = None;
2328        }
2329        let next_backend = params.backend();
2330        let max_window_size = 1usize << params.window_log;
2331        self.dictionary_retained_budget = 0;
2332        // Drop any frame-local borrowed staging so it can't leak across a
2333        // reset and misroute the next start/skip into borrowed dispatch.
2334        self.borrowed_pending = None;
2335        if self.active_backend() != next_backend {
2336            // Drain the outgoing backend's allocations into the shared
2337            // pool. The `match &mut self.storage { ... }` block runs to
2338            // completion before the assignment below replaces the
2339            // variant, so the inner state we just drained is dropped
2340            // with the old variant.
2341            match &mut self.storage {
2342                MatcherStorage::Simple(_m) => {
2343                    // FastKernelMatcher owns a flat Vec<u8> history
2344                    // and a Vec<u32> hash table — both drop with the
2345                    // variant assignment below, no per-block buffers
2346                    // to recycle into the driver pools. The
2347                    // assignment-replace path collapses to a noop
2348                    // pre-pass for this backend.
2349                }
2350                MatcherStorage::Dfast(m) => {
2351                    // Drop the long / short hash table allocations
2352                    // before calling `m.reset`. Without this prepass,
2353                    // `DfastMatchGenerator::reset` would `fill` both
2354                    // tables with `DFAST_EMPTY_SLOT` sentinels — wasted
2355                    // work given the next assignment to `self.storage`
2356                    // is about to drop `m` entirely. `reset` itself
2357                    // short-circuits on `if !self.tables.is_empty()`, so
2358                    // handing it an empty `Vec` skips the fill loop.
2359                    // Mirrors the pre-drain pattern in the HashChain
2360                    // arm below (and serves the same peak-memory
2361                    // purpose: release the table-allocation footprint
2362                    // before constructing the replacement variant).
2363                    m.tables = Vec::new();
2364                    m.reset();
2365                }
2366                MatcherStorage::Row(m) => {
2367                    m.row_heads = Vec::new();
2368                    m.row_positions = Vec::new();
2369                    m.row_tags = Vec::new();
2370                    m.reset();
2371                }
2372                MatcherStorage::HashChain(m) => {
2373                    // Release oversized tables when switching away from
2374                    // HashChain so Best's larger allocations don't persist.
2375                    // hash3_table must be released alongside the other
2376                    // two: BtUltra2's `1 << HC3_HASH_LOG` entries would
2377                    // otherwise stay pinned across the backend switch,
2378                    // even though no future caller of this backend will
2379                    // touch them.
2380                    m.table.hash_table = Vec::new();
2381                    m.table.chain_table = Vec::new();
2382                    m.table.hash3_table = Vec::new();
2383                    let vec_pool = &mut self.vec_pool;
2384                    m.reset(|mut data| {
2385                        data.resize(data.capacity(), 0);
2386                        vec_pool.push(data);
2387                    });
2388                }
2389            }
2390            // Swap in a fresh variant for the new backend. The previous
2391            // `storage` is dropped here.
2392            self.storage = match next_backend {
2393                super::strategy::BackendTag::Simple => {
2394                    // Per-level Fast cParams from resolve_level_params:
2395                    // Level(1) gets (hash_log=14, mls=7); Level(-7..=-1)
2396                    // get upstream zstd row-0 (hash_log=13, mls=7); Fastest /
2397                    // Uncompressed keep (hash_log=14, mls=6). See
2398                    // resolve_level_params for rationale.
2399                    let fast = params.fast.expect("Fast level row carries a FastConfig");
2400                    MatcherStorage::Simple(FastKernelMatcher::with_params(
2401                        params.window_log,
2402                        fast.hash_log,
2403                        fast.mls,
2404                        fast.step_size,
2405                    ))
2406                }
2407                super::strategy::BackendTag::Dfast => {
2408                    MatcherStorage::Dfast(DfastMatchGenerator::new(max_window_size))
2409                }
2410                super::strategy::BackendTag::Row => {
2411                    MatcherStorage::Row(RowMatchGenerator::new(max_window_size))
2412                }
2413                super::strategy::BackendTag::HashChain => {
2414                    MatcherStorage::HashChain(HcMatchGenerator::new(max_window_size))
2415                }
2416            };
2417        }
2418
2419        // Single source of truth: `LevelParams::strategy_tag` is the
2420        // authoritative mapping from `CompressionLevel` to strategy.
2421        // `storage.backend()` derives the parse family from the variant,
2422        // so there is no separate runtime tag that could drift against
2423        // `LEVEL_TABLE`.
2424        self.strategy_tag = params.strategy_tag;
2425        self.search = params.search;
2426        self.parse = params.parse();
2427        self.slice_size = self.base_slice_size.min(max_window_size);
2428        self.reported_window_size = max_window_size;
2429        let strategy_tag = self.strategy_tag;
2430        // Source-proportional table window for the backends whose hash-table
2431        // widths are recomputed here (Dfast / Row). Like the HC / Fast caps
2432        // in `adjust_params_for_source_size`, this sizes the internal tables
2433        // from the RAW source log (not the wire `window_log` floor) so a
2434        // small frame zeroes a small table; it never exceeds the real window.
2435        let table_window_size = match hint {
2436            Some(h) => {
2437                let raw_log = source_size_ceil_log(h);
2438                // Clamp the shift below the pointer width before `1usize <<`:
2439                // an oversized hint (>= 2^63 + 1, and on 32-bit usize any hint
2440                // >= 2^32) drives `raw_log` to 64 / >= 32, and the shift would
2441                // overflow (panic in debug, wrap to 0 in release) before the
2442                // `.min(max_window_size)` cap below could bound it. The min cap
2443                // still provides the real semantic window bound.
2444                let shift = raw_log.max(MIN_WINDOW_LOG).min(usize::BITS as u8 - 1);
2445                (1usize << shift).min(max_window_size)
2446            }
2447            None => max_window_size,
2448        };
2449        // The hint-dependent hash-table width the active backend applies, for
2450        // the primed-snapshot key. Dfast/Row compute it from `table_window_size`
2451        // below; HC/Fast leave it `0` because their widths live in `params`
2452        // (`hc.{hash,chain}_log` / `fast_hash_log`) — already part of the key.
2453        let mut resolved_table_bits: usize = 0;
2454        match &mut self.storage {
2455            MatcherStorage::Simple(m) => {
2456                // Per-level Fast cParams threaded from
2457                // resolve_level_params (see Simple-backend swap
2458                // arm above for the (level → params) mapping).
2459                let fast = params.fast.expect("Fast level row carries a FastConfig");
2460                // Same attach/copy split the dict-prime dispatch applies
2461                // below (`prime_with_dictionary`): only attach-mode dict
2462                // frames may keep the main table across the reset via an
2463                // epoch advance — copy-mode and no-dict frames must memset
2464                // it back to bias 0 for the raw-slice kernels.
2465                // `Some(0)` is "no dictionary" (the dict-sizing path above
2466                // filters it the same way): an empty dict primes nothing, so
2467                // an epoch-advance reset would preserve stale attach state
2468                // instead of clearing it.
2469                let dict_attach_epoch = matches!(dict_hint, Some(size) if size > 0)
2470                    && self.reset_dict_attach_ok
2471                    && self
2472                        .reset_size_log
2473                        .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2474                // Copy-mode dictionary frame whose primed snapshot matches
2475                // this exact resolved shape: `restore_primed_dictionary`
2476                // (called right after this reset; the caller gates the
2477                // restore on the same size bucket and the restore re-checks
2478                // the same key) will `clone_from` the snapshot over this
2479                // matcher, replacing the table contents and bias wholesale —
2480                // the reset's full-table memset would be thrown away. The
2481                // key components mirror `reset_shape` below: Simple leaves
2482                // `resolved_table_bits` 0, never carries an LDM override,
2483                // and `fast_attach` is false in copy mode by construction.
2484                let table_overwritten_by_restore = matches!(dict_hint, Some(size) if size > 0)
2485                    && !dict_attach_epoch
2486                    && self.primed.as_ref().is_some_and(|(_, _, captured)| {
2487                        *captured
2488                            == PrimedKey {
2489                                level,
2490                                params,
2491                                table_bits: 0,
2492                                fast_attach: false,
2493                                ldm: None,
2494                            }
2495                    });
2496                // Cap `hash_log <= window_log + 1` (upstream zstd
2497                // `ZSTD_adjustCParams_internal`): once `window_log` is resized
2498                // down for a small source, a level-default `1 << hash_log`
2499                // table is mostly wasted address space whose per-frame memset
2500                // dominates the compress cost on tiny frames (a 4 KB frame at
2501                // window_log 12 still zero-fills the 64 KiB hash_log-14 table).
2502                // Gated to no-dict frames: the dict-attach path shares one
2503                // hash_log between the main and dict tables (so one hash keys
2504                // both), and shrinking only the main table would break that
2505                // invariant and the small-frame dict ratio.
2506                let hash_log = if dict_hint.is_some_and(|s| s > 0) {
2507                    fast.hash_log
2508                } else {
2509                    fast.hash_log.min(params.window_log as u32 + 1)
2510                };
2511                m.reset(
2512                    params.window_log,
2513                    hash_log,
2514                    fast.mls,
2515                    fast.step_size,
2516                    dict_attach_epoch,
2517                    table_overwritten_by_restore,
2518                );
2519            }
2520            MatcherStorage::Dfast(dfast) => {
2521                dfast.max_window_size = max_window_size;
2522                let dcfg = params
2523                    .dfast
2524                    .expect("Dfast level row must carry a DfastConfig");
2525                // Upstream zstd `cParams.hashLog`/`chainLog`, capped by the
2526                // source-size window when hinted so tiny inputs don't
2527                // over-allocate.
2528                let long_bits = if hinted {
2529                    dfast_hash_bits_for_window(table_window_size).min(dcfg.long_hash_log as usize)
2530                } else {
2531                    dcfg.long_hash_log as usize
2532                };
2533                let short_bits = if hinted {
2534                    dfast_hash_bits_for_window(table_window_size).min(dcfg.short_hash_log as usize)
2535                } else {
2536                    dcfg.short_hash_log as usize
2537                };
2538                resolved_table_bits = long_bits;
2539                dfast.set_hash_bits(long_bits, short_bits);
2540                // Dfast holds no per-block input Vecs (history owns the
2541                // bytes and `add_data` returns each Vec eagerly), so
2542                // `reset` takes no `reuse_space` callback.
2543                dfast.reset();
2544            }
2545            MatcherStorage::Row(row) => {
2546                row.max_window_size = max_window_size;
2547                row.lazy_depth = params.lazy_depth;
2548                let mut row_cfg = params.row.expect("Row level row carries a RowConfig");
2549                if hinted {
2550                    // Clamp the configured hash width by the hinted window
2551                    // (upstream zstd `ZSTD_adjustCParams` caps hashLog by windowLog) —
2552                    // `min`, not replace, so an explicit `hash_log` param
2553                    // override (`row_cfg.hash_bits`) survives the hinted path
2554                    // instead of being overwritten by the window value.
2555                    //
2556                    // Clamp BEFORE `configure` so the backend sees ONE width
2557                    // per frame. Configuring with the unclamped level width
2558                    // and then re-clamping made `row_hash_log` oscillate on
2559                    // every hinted frame, and each width change clears the
2560                    // row tables — `ensure_tables` then re-filled all three
2561                    // every frame in a reused compressor.
2562                    row_cfg.hash_bits = row_cfg
2563                        .hash_bits
2564                        .min(row_hash_bits_for_window(table_window_size));
2565                }
2566                row.configure(row_cfg);
2567                // Key the primed snapshot on the width the backend ACTUALLY
2568                // applied (`set_hash_bits` clamps the request): recording the
2569                // request — or the 0 default on the unhinted path — keys
2570                // identical table geometries apart and forces needless
2571                // dictionary re-primes.
2572                resolved_table_bits = row.hash_bits();
2573                row.reset();
2574            }
2575            MatcherStorage::HashChain(hc) => {
2576                hc.table.max_window_size = max_window_size;
2577                hc.hc.lazy_depth = params.lazy_depth;
2578                let mut hc_cfg = params.hc.expect("HashChain level row carries an HcConfig");
2579                // Cap the hash / chain table logs by the hinted window so a small
2580                // input doesn't allocate the full level's tables (the upstream zstd
2581                // `ZSTD_adjustCParams_internal` clamp: `hashLog <= windowLog + 1`,
2582                // and `cycleLog <= windowLog` — `cycleLog == chainLog` for the HC
2583                // finder, `chainLog - 1` for the BT pair table, so `chainLog <=
2584                // windowLog` (+1 for BT)). Ratio-neutral: a hinted window of
2585                // `2^wlog` bytes holds at most `2^wlog` positions, so the slots
2586                // beyond that are never populated — capping only sheds unused
2587                // allocation. Was the source of L10-lazy peak-alloc ~2.15x the
2588                // upstream zstd on a 1 MiB input. Only applied when hinted; an
2589                // unknown-size stream keeps the full level tables.
2590                // Skip for dict-bearing frames: their `hc_cfg.{hash,chain}_log`
2591                // were already sized to the dictionary content tier via
2592                // `cdict_table_logs` (the dict supplies the long-distance
2593                // matches, so upstream `ZSTD_createCDict` sizes the prepared
2594                // tables to the dict, not the source window). Re-applying the
2595                // source-window cap here would collapse those dict-tier logs
2596                // back to the small hinted source — the same double-cap the
2597                // synthesis sites avoid by using the un-hinted base width.
2598                if hinted && !matches!(dict_hint, Some(size) if size > 0) {
2599                    let wlog = hc_hash_bits_for_window(table_window_size);
2600                    let uses_bt = matches!(
2601                        strategy_tag,
2602                        super::strategy::StrategyTag::Btlazy2
2603                            | super::strategy::StrategyTag::BtOpt
2604                            | super::strategy::StrategyTag::BtUltra
2605                            | super::strategy::StrategyTag::BtUltra2
2606                    );
2607                    hc_cfg.hash_log = hc_cfg.hash_log.min(wlog + 1);
2608                    hc_cfg.chain_log = hc_cfg.chain_log.min(if uses_bt { wlog + 1 } else { wlog });
2609                }
2610                hc.configure(hc_cfg, strategy_tag, params.window_log);
2611                let vec_pool = &mut self.vec_pool;
2612                hc.reset(|mut data| {
2613                    data.resize(data.capacity(), 0);
2614                    vec_pool.push(data);
2615                });
2616                // When the source size is known, pre-size the history mirror to
2617                // the expected total (dictionary + payload) so per-block growth
2618                // does not overshoot via Vec capacity doubling (upstream zstd sizes its
2619                // window buffer exactly). Dominates peak once the match-finder
2620                // tables are dictionary-tier-small. Unhinted streams skip this
2621                // and keep doubling growth.
2622                if let Some(src) = hint {
2623                    // `src` is a u64 hint and may be the u64::MAX "unknown
2624                    // size" sentinel, which truncates under `as usize` on
2625                    // 32-bit targets and overflows when the dict hint is
2626                    // added. Saturate the source size, then saturate the
2627                    // dict-hint addition; `reserve_history` applies the
2628                    // tighter window ceiling to the result.
2629                    let src_hint = usize::try_from(src).unwrap_or(usize::MAX);
2630                    let expected = src_hint.saturating_add(dict_hint.unwrap_or(0));
2631                    hc.table.reserve_history(expected);
2632                }
2633            }
2634        }
2635        // LDM wiring (#27): attach (or clear) the long-distance-match
2636        // producer on the optimal (BT) backend. LDM is the only
2637        // back-reference path that crosses the regular window, so it
2638        // only has a home on the `BtMatcher`; non-BT strategies drop the
2639        // producer. Built AFTER `hc.reset()` because `BtMatcher::reset`
2640        // clears an existing producer's table but does not null the
2641        // slot — installing here gives the new frame a fresh producer.
2642        #[cfg(feature = "hash")]
2643        if let MatcherStorage::HashChain(hc) = &mut self.storage {
2644            let producer = self
2645                .param_overrides
2646                .as_ref()
2647                .and_then(|ov| ov.ldm)
2648                .map(|ldm_ov| {
2649                    let strategy_ord = ldm_strategy_ordinal(params.strategy_tag, params.lazy_depth);
2650                    // Seed the caller-pinned knobs, then run the upstream zstd
2651                    // derivation over the seed so the remaining (zero)
2652                    // fields are filled with cross-field consistency
2653                    // (e.g. `hash_rate_log = window_log - hash_log`).
2654                    // Clobbering after `adjust_for` would break that and
2655                    // hand the producer an inconsistent set.
2656                    let seed = super::ldm::params::LdmParams {
2657                        window_log: params.window_log as u32,
2658                        hash_log: ldm_ov.hash_log.unwrap_or(0),
2659                        hash_rate_log: ldm_ov.hash_rate_log.unwrap_or(0),
2660                        min_match_length: ldm_ov.min_match.unwrap_or(0),
2661                        bucket_size_log: ldm_ov.bucket_size_log.unwrap_or(0),
2662                    };
2663                    super::ldm::LdmProducer::new(seed.derive(strategy_ord))
2664                });
2665            hc.set_ldm_producer(producer);
2666        }
2667        // Record the resolved matcher shape for the primed-snapshot key. Captured
2668        // here (post-resolution, after the test-only param override) so the key
2669        // reflects exactly the geometry the restored `storage` must match. The
2670        // Fast attach-vs-copy mode is part of the shape ONLY for the Simple
2671        // backend (it decides the distinct dict-table shape that backend builds).
2672        // Dfast/Row/HashChain have their OWN attach/copy regimes, but this bit
2673        // models only the Fast table split; those backends are keyed by the
2674        // resolved matcher geometry instead, so folding the Fast bit into their
2675        // key would over-key identical resolved shapes. When it applies it
2676        // matches the decision `prime_with_dictionary` makes from the same
2677        // `reset_size_log`.
2678        let fast_attach = matches!(next_backend, super::strategy::BackendTag::Simple)
2679            && self.reset_dict_attach_ok
2680            && self
2681                .reset_size_log
2682                .is_none_or(|log| log <= FAST_ATTACH_DICT_CUTOFF_LOG);
2683        // The LDM override is part of the snapshot identity ONLY on the
2684        // optimal (BinaryTree) path: that is the only backend whose cloned
2685        // `storage` carries a `BtMatcher::ldm_producer`. On Fast / Dfast /
2686        // Row and lazy-HashChain resets the producer slot does not exist,
2687        // so folding the override there would over-key the snapshot and
2688        // force needless re-primes when LDM is toggled. Gated like
2689        // `fast_attach` (a key bit only participates where it changes the
2690        // cloned matcher shape).
2691        let active_ldm = if matches!(params.search, super::strategy::SearchMethod::BinaryTree) {
2692            self.param_overrides.and_then(|ov| ov.ldm)
2693        } else {
2694            None
2695        };
2696        self.reset_shape = Some((params, resolved_table_bits, fast_attach, active_ldm));
2697    }
2698
2699    fn dictionary_is_resident(&self) -> bool {
2700        match &self.storage {
2701            MatcherStorage::HashChain(hc) => hc.table.dict_resident,
2702            MatcherStorage::Simple(s) => s.dict_resident(),
2703            MatcherStorage::Dfast(d) => d.dict_resident(),
2704            _ => false,
2705        }
2706    }
2707
2708    fn reapply_resident_dictionary(&mut self, offset_hist: [u32; 3]) {
2709        // Same offset-history head as `prime_with_dictionary`, without the dict
2710        // commit / re-index (resident dict bytes + cached dms already in place).
2711        match self.active_backend() {
2712            super::strategy::BackendTag::Simple => {
2713                self.simple_mut().prime_offset_history(offset_hist)
2714            }
2715            super::strategy::BackendTag::Dfast => {
2716                self.dfast_matcher_mut().offset_hist = offset_hist
2717            }
2718            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2719            super::strategy::BackendTag::HashChain => {
2720                let matcher = self.hc_matcher_mut();
2721                matcher.table.offset_hist = offset_hist;
2722                matcher.table.mark_dictionary_primed();
2723            }
2724        }
2725        // Restore the retained-dictionary budget the per-frame `reset` cleared.
2726        // The matcher's `reset` re-inflated `max_window_size` by the resident
2727        // dict region (so the dict + next input both stay in the eviction band),
2728        // exactly as `prime_with_dictionary` does — but the resident path skips
2729        // that prime, so without this the driver-level budget stays 0 and
2730        // `retire_dictionary_budget` never shrinks the inflated window as input
2731        // evicts the dict. For HashChain (whose `window_low` is measured against
2732        // `max_window_size`), a stuck-inflated window would let a post-eviction
2733        // match exceed the frame header's base window and emit an over-window
2734        // offset. The inflation equals `max_window_size - base`, and
2735        // `reported_window_size` is the base `1 << window_log` set by `reset`.
2736        let base = self.reported_window_size;
2737        let inflated = match self.active_backend() {
2738            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2739            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2740            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2741            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2742        };
2743        self.dictionary_retained_budget = inflated.saturating_sub(base);
2744    }
2745
2746    fn prime_with_dictionary(&mut self, dict_content: &[u8], offset_hist: [u32; 3]) {
2747        match self.active_backend() {
2748            super::strategy::BackendTag::Simple => {
2749                // Routes through prime_offset_history so BOTH
2750                // offset_hist (wire encoder) and rep[0..2] (kernel)
2751                // are updated atomically. Without this, the two
2752                // tracks drift after dict priming — kernel emits
2753                // repcode matches against stale FAST_INITIAL_REP
2754                // while the wire encoder uses the primed history,
2755                // producing divergent wire encoding (Copilot review
2756                // #15 on #216).
2757                self.simple_mut().prime_offset_history(offset_hist);
2758            }
2759            super::strategy::BackendTag::Dfast => {
2760                self.dfast_matcher_mut().offset_hist = offset_hist
2761            }
2762            super::strategy::BackendTag::Row => self.row_matcher_mut().offset_hist = offset_hist,
2763            super::strategy::BackendTag::HashChain => {
2764                let matcher = self.hc_matcher_mut();
2765                matcher.table.offset_hist = offset_hist;
2766                matcher.table.mark_dictionary_primed();
2767            }
2768        }
2769
2770        if dict_content.is_empty() {
2771            return;
2772        }
2773
2774        // Dictionary bytes should stay addressable until produced frame output
2775        // itself exceeds the live window size. We bump `max_window_size`
2776        // by the dictionary length so the eviction band keeps the
2777        // primed bytes in `history`.
2778        //
2779        // Cap: `with_params`/`reset` enforce `window_log <= 30` so the
2780        // eviction band `2 * max_window_size` stays below `u32::MAX`
2781        // with headroom for one MAX_BLOCK_SIZE pending block — the
2782        // kernel asserts `data.len() <= u32::MAX`. A large enough
2783        // dictionary could otherwise push `max_window_size` past
2784        // that ceiling via the `saturating_add` below and silently
2785        // re-introduce the same overflow the `window_log` cap was
2786        // designed to prevent. Clamp the post-priming size so the
2787        // doubled-band-plus-block invariant survives.
2788        use super::match_table::storage::MAX_PRIMED_WINDOW_SIZE;
2789
2790        // `requested_dict_budget` is what the caller asked for;
2791        // `base_max_window_size` snapshots the pre-priming cap so we
2792        // can compute how much window the cap actually GRANTED below.
2793        // The cap may clip the requested growth, in which case the
2794        // bookkeeping (`dictionary_retained_budget` retire path) must
2795        // track only the granted portion — otherwise
2796        // `retire_dictionary_budget()` would later reclaim more than
2797        // was actually added and shrink the matcher below its real
2798        // base window (and `cap = 2 * max_window_size` would shrink
2799        // with it, risking under-allocation on subsequent commits).
2800        // The `granted_retained_budget` calculation further below is
2801        // the load-bearing piece — see its block-level comment for
2802        // the post-clip / post-uncommitted-tail math.
2803        let requested_dict_budget = dict_content.len();
2804        let base_max_window_size = match self.active_backend() {
2805            super::strategy::BackendTag::Simple => self.simple_mut().max_window_size,
2806            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().max_window_size,
2807            super::strategy::BackendTag::Row => self.row_matcher_mut().max_window_size,
2808            super::strategy::BackendTag::HashChain => self.hc_matcher_mut().table.max_window_size,
2809        };
2810        match self.active_backend() {
2811            super::strategy::BackendTag::Simple => {
2812                let matcher = self.simple_mut();
2813                matcher.max_window_size = matcher
2814                    .max_window_size
2815                    .saturating_add(requested_dict_budget)
2816                    .min(MAX_PRIMED_WINDOW_SIZE);
2817            }
2818            super::strategy::BackendTag::Dfast => {
2819                let matcher = self.dfast_matcher_mut();
2820                matcher.max_window_size = matcher
2821                    .max_window_size
2822                    .saturating_add(requested_dict_budget)
2823                    .min(MAX_PRIMED_WINDOW_SIZE);
2824            }
2825            super::strategy::BackendTag::Row => {
2826                let matcher = self.row_matcher_mut();
2827                matcher.max_window_size = matcher
2828                    .max_window_size
2829                    .saturating_add(requested_dict_budget)
2830                    .min(MAX_PRIMED_WINDOW_SIZE);
2831            }
2832            super::strategy::BackendTag::HashChain => {
2833                let matcher = self.hc_matcher_mut();
2834                matcher.table.max_window_size = matcher
2835                    .table
2836                    .max_window_size
2837                    .saturating_add(requested_dict_budget)
2838                    .min(MAX_PRIMED_WINDOW_SIZE);
2839            }
2840        }
2841
2842        let mut start = 0usize;
2843        let mut committed_dict_budget = 0usize;
2844        // insert_position needs 4 bytes of lookahead for hashing;
2845        // backfill_boundary_positions re-visits tail positions once the
2846        // next slice extends history, but cannot hash <4 byte fragments.
2847        let min_primed_tail = match self.active_backend() {
2848            super::strategy::BackendTag::Simple => MIN_MATCH_LEN,
2849            super::strategy::BackendTag::Dfast
2850            | super::strategy::BackendTag::Row
2851            | super::strategy::BackendTag::HashChain => 4,
2852        };
2853        while start < dict_content.len() {
2854            let end = (start + self.slice_size).min(dict_content.len());
2855            if end - start < min_primed_tail {
2856                break;
2857            }
2858            // Stage the dict chunk WITHOUT `get_next_space`'s
2859            // `resize(slice_size, 0)` zero-fill: that memsets a full
2860            // block-sized buffer (up to ~128 KiB) every frame only to have it
2861            // `clear()`-ed and overwritten by the dict bytes on the very next
2862            // lines — pure waste (measured ~10% of the small dict encode).
2863            // Reuse a pooled buffer's capacity if one is free (the prime/skip
2864            // cycle recycles them back), else allocate exactly the chunk.
2865            // Mirrors upstream zstd, which references the CDict content rather
2866            // than zero-filling a fresh window per frame.
2867            let mut space = self.vec_pool.pop().unwrap_or_default();
2868            space.clear();
2869            space.extend_from_slice(&dict_content[start..end]);
2870            self.commit_space(space);
2871            self.skip_matching_for_dictionary_priming();
2872            committed_dict_budget += end - start;
2873            start = end;
2874        }
2875
2876        // Derive `granted_retained_budget` directly from the two real
2877        // bounds — bytes actually committed and bytes the cap allows
2878        // — instead of doing a cap-clip pass followed by an
2879        // uncommitted-tail subtract. Previous shape double-discounted
2880        // when the cap clipped: clip lost `(requested - allowed)`,
2881        // then tail-subtract lost ANOTHER `(requested - committed)`,
2882        // leaving `max_window_size` shy of the dictionary that was
2883        // actually retained (e.g. cap=900, committed=998, uncommitted=2
2884        // landed at granted=898 instead of the correct 900).
2885        let capped_retained_budget = MAX_PRIMED_WINDOW_SIZE.saturating_sub(base_max_window_size);
2886        let granted_retained_budget = committed_dict_budget.min(capped_retained_budget);
2887        let final_max_window_size = base_max_window_size.saturating_add(granted_retained_budget);
2888        match self.active_backend() {
2889            super::strategy::BackendTag::Simple => {
2890                self.simple_mut().max_window_size = final_max_window_size;
2891            }
2892            super::strategy::BackendTag::Dfast => {
2893                self.dfast_matcher_mut().max_window_size = final_max_window_size;
2894            }
2895            super::strategy::BackendTag::Row => {
2896                self.row_matcher_mut().max_window_size = final_max_window_size;
2897            }
2898            super::strategy::BackendTag::HashChain => {
2899                self.hc_matcher_mut().table.max_window_size = final_max_window_size;
2900            }
2901        }
2902        if granted_retained_budget > 0 {
2903            self.dictionary_retained_budget = self
2904                .dictionary_retained_budget
2905                .saturating_add(granted_retained_budget);
2906        }
2907        if self.active_backend() == super::strategy::BackendTag::HashChain {
2908            // Recompute the lazy-HC attach decision made per-chunk in
2909            // `skip_matching_for_dictionary_priming` (stable across the prime —
2910            // `reset_size_log` does not change here).
2911            //
2912            // The HC attach/copy mode is deliberately NOT folded into `PrimedKey`
2913            // (unlike Fast `fast_attach`). Fast attach builds a separate dict
2914            // table whose dimensions differ from the copy-mode live table, so a
2915            // cross-mode restore would install mismatched table geometry and the
2916            // encoder could search past the frame window (undecodable). The two
2917            // HC modes share identical window geometry: `max_window_size` and the
2918            // dictionary limit are both set ABOVE this branch (the same value in
2919            // either mode), and the live chain table dimensions come from the
2920            // resolved `params` the key already pins. The modes differ only in
2921            // WHERE the committed dict lives — a single-link `dms` (attach) vs
2922            // merged into the live chain (copy) — both producing valid matches at
2923            // in-window offsets. Upstream zstd makes the same observation: attach
2924            // (`ZSTD_resetCCtx_byAttachingCDict`) and copy
2925            // (`ZSTD_resetCCtx_byCopyingCDict`) both keep the caller's
2926            // `windowLog`; the choice is a memory/speed trade-off, not a wire
2927            // contract. So restoring an attach snapshot where this frame would
2928            // have copied (or vice versa) yields a decodable frame that may only
2929            // differ in which matches are found (ratio) — algorithmic freedom, not
2930            // a defect. Keying on the mode would instead force a re-prime across
2931            // the cutoff, re-adding the per-frame cost this snapshot path removes.
2932            //
2933            // In practice the public reuse path (`compress_independent_frame`)
2934            // only ever captures AND restores the COPY-mode snapshot — capture is
2935            // gated on the above-cutoff source size, so a restored frame always
2936            // matches the captured mode. `hc_dict_snapshot_reuse_roundtrips` pins
2937            // that same-mode reuse decodes; the driver-level cross-mode restore is
2938            // accepted (not refused) per
2939            // `primed_snapshot_fast_attach_does_not_over_key_non_simple_backends`.
2940            let attach = self.hc_dict_attach_mode();
2941            let table = &mut self.hc_matcher_mut().table;
2942            table.set_dictionary_limit_from_primed_bytes(committed_dict_budget);
2943            // Build the dictMatchState over the committed dict (front of history)
2944            // so `find_best_match` dual-probes it with its own compare budget —
2945            // but ONLY in ATTACH mode. BT/optimal attach → DUBT dms; lazy-HC
2946            // attach → single-link hash-chain dms. COPY mode (large known source,
2947            // both BT and lazy-HC) already merged the dict into the live tree /
2948            // chain in `skip_matching_for_dictionary_priming`, so it carries no
2949            // separate dms — drop any stale one.
2950            if !attach {
2951                table.dms.invalidate();
2952            } else if table.uses_bt {
2953                table.prime_dms_bt(committed_dict_budget);
2954            } else {
2955                table.prime_dms_hc(committed_dict_budget);
2956            }
2957        }
2958        // CDict-equivalent: now that every dict chunk is indexed, mark the
2959        // Fast-backend dict table primed so the next frame's re-prime reuses
2960        // it (skips the re-hash) while still re-committing the dict bytes to
2961        // history. No-op when the attach path built no table (copy mode or a
2962        // sub-8-byte dict) — `mark_dict_primed` self-guards on table presence.
2963        match self.active_backend() {
2964            super::strategy::BackendTag::Simple => self.simple_mut().mark_dict_primed(),
2965            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().mark_dict_primed(),
2966            super::strategy::BackendTag::Row => self.row_matcher_mut().mark_dict_primed(),
2967            _ => {}
2968        }
2969    }
2970
2971    fn restore_primed_dictionary(&mut self, level: super::CompressionLevel) -> bool {
2972        // Only the (storage, dictionary_retained_budget) pair is what
2973        // `prime_with_dictionary` writes; restoring them reproduces the
2974        // post-prime state exactly. Gated on the FULL resolved key (level + the
2975        // resolved `LevelParams` + the active backend's table width), not just
2976        // the level: `reset` resolves the hint into a window/table geometry, so a
2977        // same-level snapshot taken at a hint that resolved to a different shape
2978        // carries a `storage.max_window_size` / table dimensions that no longer
2979        // match this reset. Restoring it would let the encoder search past the
2980        // frame header's window (an undecodable match), so on a key mismatch we
2981        // refuse and the caller re-primes.
2982        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
2983            return false;
2984        };
2985        let key = PrimedKey {
2986            level,
2987            params,
2988            table_bits,
2989            fast_attach,
2990            ldm,
2991        };
2992        let Some((snapshot, budget, captured_key)) = &self.primed else {
2993            return false;
2994        };
2995        if *captured_key != key {
2996            return false;
2997        }
2998        let budget = *budget;
2999        match (&mut self.storage, snapshot) {
3000            // Same-variant Fast restore: copy the snapshot into the retained
3001            // live storage. `clone_from` reuses the history / hash-table /
3002            // dict-table buffers, so this is the upstream zstd CDict table-copy
3003            // regime's cost (pure copies) instead of a full per-frame
3004            // allocation + copy + drop cycle.
3005            (MatcherStorage::Simple(live), MatcherStorage::Simple(snap)) => {
3006                live.clone_from(snap);
3007            }
3008            // Same-variant HC lazy/greedy restore (non-BT): the snapshot keeps
3009            // the full primed hash/chain tables (capture's non-BT full clone),
3010            // so `clone_from` reuses the live history/hash/chain/dms buffers in
3011            // place — upstream zstd reuses the CDict tables rather than reallocating
3012            // them. This is the per-frame allocate+copy+drop that dominated
3013            // small `compress-dict` HC frames (5-7x vs C). BT (`uses_bt`)
3014            // snapshots drop their live tables, so they stay on the realloc
3015            // path below.
3016            (MatcherStorage::HashChain(live), MatcherStorage::HashChain(snap))
3017                if !snap.table.uses_bt =>
3018            {
3019                live.table.clone_from(&snap.table);
3020                live.hc.clone_from(&snap.hc);
3021                live.strategy_tag = snap.strategy_tag;
3022                // backend is `HcBackend::Hc` (zero-sized) for non-BT levels;
3023                // the live one is already correct for this resolved key.
3024            }
3025            (live, snapshot_storage) => {
3026                let mut storage = snapshot_storage.clone();
3027                // This arm handles the binary-tree backend. In ATTACH mode the
3028                // snapshot was stored WITHOUT its live hash / chain / hash3
3029                // tables (they hold no dictionary entries — the dict lives in
3030                // `dms` + history; see `capture_primed_dictionary`), so
3031                // `ensure_tables` re-allocates them zeroed to the snapshot's
3032                // geometry, exactly reproducing the post-prime state (all
3033                // `HC_EMPTY`). In COPY mode the snapshot retained its FULL live
3034                // tree (the dict was merged into it, no `dms`), so the tables are
3035                // already present at the right length and `ensure_tables` — which
3036                // only allocates on a length mismatch — leaves them untouched.
3037                // Either way this is a full storage replace, so no stale
3038                // live-table entry from a prior frame can survive.
3039                if let MatcherStorage::HashChain(hc) = &mut storage {
3040                    hc.table.ensure_tables();
3041                }
3042                // The snapshot does not retain the LDM producer (it holds no
3043                // dict state; see `capture_primed_dictionary`). Carry over the
3044                // frame's freshly-reset producer — built this frame by `reset`
3045                // with the same params the snapshot key pins, and empty (no
3046                // input processed yet), so it is equivalent to the producer
3047                // the snapshot was captured with.
3048                #[cfg(feature = "hash")]
3049                {
3050                    let fresh_ldm = if let MatcherStorage::HashChain(hc) = live {
3051                        hc.take_ldm_producer()
3052                    } else {
3053                        None
3054                    };
3055                    if let MatcherStorage::HashChain(hc) = &mut storage {
3056                        hc.set_ldm_producer(fresh_ldm);
3057                    }
3058                }
3059                *live = storage;
3060            }
3061        }
3062        self.dictionary_retained_budget = budget;
3063        true
3064    }
3065
3066    fn capture_primed_dictionary(&mut self, level: super::CompressionLevel) {
3067        // No resolved shape means `reset` has not run for this frame — nothing
3068        // valid to key a snapshot on, so skip the capture.
3069        let Some((params, table_bits, fast_attach, ldm)) = self.reset_shape else {
3070            return;
3071        };
3072        let key = PrimedKey {
3073            level,
3074            params,
3075            table_bits,
3076            fast_attach,
3077            ldm,
3078        };
3079        // CDict-equivalent retained state. A binary-tree level in ATTACH mode
3080        // decouples the dictionary into `dms` (the upstream zstd `dictMatchState`); its
3081        // live hash / chain / hash3 tables carry NO dict entries
3082        // (`skip_matching_dict_bt` keeps the dict out of the live tree), so they
3083        // are pure zeros. Storing them in the snapshot wastes the full table
3084        // footprint (a second window-tier table set resident for the whole
3085        // compress). Instead, move the live tables OUT of the working storage,
3086        // clone only the dict-state (history + `dms` + window/offset/dict-limit),
3087        // then move the live tables back — the snapshot keeps just what upstream zstd's
3088        // CDict keeps, and `restore_primed_dictionary` re-allocates the zeroed
3089        // live tables. Every other case keeps the dict reachable through the live
3090        // structure, so the snapshot must retain the full tables (full clone):
3091        // lazy-HC attach (it DOES prime a hash-chain `dms`, but the live chain is
3092        // still the search structure, so the tables must travel) and COPY mode for
3093        // BOTH BT and lazy-HC (`dms` invalidated, dict merged into the live tree /
3094        // chain). `uses_bt && dms.is_primed()` is therefore the exact "decoupled"
3095        // signal — true only for the BT attach prime; lazy-HC attach primes `dms`
3096        // too but is intentionally NOT decoupled.
3097        let bt_decoupled = matches!(
3098            &self.storage,
3099            MatcherStorage::HashChain(hc) if hc.table.uses_bt && hc.table.dms.is_primed()
3100        );
3101        if bt_decoupled {
3102            let MatcherStorage::HashChain(hc) = &mut self.storage else {
3103                unreachable!("bt_decoupled implies HashChain storage");
3104            };
3105            let hash_table = core::mem::take(&mut hc.table.hash_table);
3106            let chain_table = core::mem::take(&mut hc.table.chain_table);
3107            let hash3_table = core::mem::take(&mut hc.table.hash3_table);
3108            // The LDM producer carries no dictionary state (LDM is not
3109            // dict-primed; its hash table is empty at capture), so it is not
3110            // retained either — `restore` reinstates the frame's freshly-reset
3111            // producer. Take it out so the clone does not duplicate its table.
3112            #[cfg(feature = "hash")]
3113            let ldm_producer = hc.take_ldm_producer();
3114            // Clone the dict-state-only storage (live tables now empty Vecs,
3115            // LDM producer detached).
3116            let snapshot = self.storage.clone();
3117            // Move the live tables (and LDM producer) back into the working storage.
3118            let MatcherStorage::HashChain(hc) = &mut self.storage else {
3119                unreachable!("storage variant is stable across the take/put");
3120            };
3121            hc.table.hash_table = hash_table;
3122            hc.table.chain_table = chain_table;
3123            hc.table.hash3_table = hash3_table;
3124            #[cfg(feature = "hash")]
3125            hc.set_ldm_producer(ldm_producer);
3126            self.primed = Some((snapshot, self.dictionary_retained_budget, key));
3127        } else {
3128            self.primed = Some((self.storage.clone(), self.dictionary_retained_budget, key));
3129        }
3130    }
3131
3132    fn invalidate_primed_dictionary(&mut self) {
3133        self.primed = None;
3134        // Drop the Fast-backend CDict-equivalent table cache too: it is keyed
3135        // to the dictionary being removed / replaced. Left in place, the next
3136        // same-params `reset` would retain it and the kernel would probe a
3137        // dict region whose bytes are no longer re-committed to history.
3138        match self.active_backend() {
3139            super::strategy::BackendTag::Simple => self.simple_mut().invalidate_dict_cache(),
3140            super::strategy::BackendTag::Dfast => self.dfast_matcher_mut().invalidate_dict_cache(),
3141            // Row keeps its attach index across frames (like Simple/Dfast),
3142            // so a dictionary swap must drop its cached dict rows too;
3143            // otherwise the next small/unknown-size frame reuses stale
3144            // attach state through `prime_dict_attach_current_block`.
3145            super::strategy::BackendTag::Row => self.row_matcher_mut().invalidate_dict_cache(),
3146            // The BT dms tree is keyed to the dict bytes; `prime_dms_bt`
3147            // skips the rebuild while its shape matches, so a swapped
3148            // dictionary of the same length would otherwise keep serving the
3149            // OLD dictionary's tree.
3150            super::strategy::BackendTag::HashChain => {
3151                self.hc_matcher_mut().table.dms.invalidate();
3152            }
3153        }
3154    }
3155
3156    fn seed_dictionary_entropy(
3157        &mut self,
3158        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
3159        ll: Option<&crate::fse::fse_encoder::FSETable>,
3160        ml: Option<&crate::fse::fse_encoder::FSETable>,
3161        of: Option<&crate::fse::fse_encoder::FSETable>,
3162    ) {
3163        if self.active_backend() == super::strategy::BackendTag::HashChain {
3164            self.hc_matcher_mut()
3165                .seed_dictionary_entropy(huff, ll, ml, of);
3166        }
3167    }
3168
3169    fn window_size(&self) -> u64 {
3170        self.reported_window_size as u64
3171    }
3172
3173    fn get_next_space(&mut self) -> Vec<u8> {
3174        if let Some(mut space) = self.vec_pool.pop() {
3175            if space.len() > self.slice_size {
3176                space.truncate(self.slice_size);
3177            }
3178            if space.len() < self.slice_size {
3179                space.resize(self.slice_size, 0);
3180            }
3181            return space;
3182        }
3183        alloc::vec![0; self.slice_size]
3184    }
3185
3186    fn get_last_space(&mut self) -> &[u8] {
3187        match &self.storage {
3188            MatcherStorage::Simple(m) => m.last_committed_space(),
3189            MatcherStorage::Dfast(m) => m.get_last_space(),
3190            MatcherStorage::Row(m) => m.get_last_space(),
3191            MatcherStorage::HashChain(m) => m.table.get_last_space(),
3192        }
3193    }
3194
3195    fn commit_space(&mut self, space: Vec<u8>) {
3196        let mut evicted_bytes = 0usize;
3197        // Split borrows manually so the `add_data` closures can write
3198        // into `vec_pool` while the backend itself holds an exclusive
3199        // borrow via `storage`. (Suffix-store recycling went away
3200        // with the legacy `MatchGenerator`; the FastKernelMatcher
3201        // arm below has no pool interaction.)
3202        let vec_pool = &mut self.vec_pool;
3203        match &mut self.storage {
3204            MatcherStorage::Simple(m) => {
3205                // FastKernelMatcher owns its history as a single
3206                // flat Vec<u8> and the hash table as a Vec<u32> —
3207                // neither recycles into the driver-side pools. The
3208                // eager pre-commit eviction inside
3209                // `FastKernelMatcher::accept_data` drops bytes when
3210                // accepting this block would push history past 2×
3211                // max_window_size; that delta is what feeds
3212                // `evicted_bytes` here via the `pre / post`
3213                // history-length comparison.
3214                let pre = m.history_len_for_eviction_accounting();
3215                m.accept_data(space);
3216                let post = m.history_len_for_eviction_accounting();
3217                // `accept_data` performs eager pre-commit window
3218                // eviction (so this `pre - post` delta correctly
3219                // feeds the dictionary-budget retire flow). See
3220                // `FastKernelMatcher::accept_data` for the
3221                // commit-time-visibility rationale (closes #216
3222                // CodeRabbit review #5 / Copilot review #1: without
3223                // eager eviction, the delta was always 0 and the
3224                // dict budget never retired, leaving max_window_size
3225                // inflated post-dict-prime → matcher could emit
3226                // offsets exceeding the frame header's window).
3227                evicted_bytes += pre.saturating_sub(post);
3228            }
3229            MatcherStorage::Dfast(m) => {
3230                // Dfast's `add_data` callback receives the INPUT
3231                // `Vec<u8>` for pool recycling (Dfast stores its
3232                // bytes in the contiguous `history` buffer, not in
3233                // per-block Vecs — there is no per-block buffer to
3234                // pop off and hand back). Counting `data.len()` as
3235                // evicted bytes would conflate "new bytes ingested"
3236                // with "old bytes evicted from window"; the two
3237                // happen to coincide when the previous window was
3238                // saturated and the new input fills it 1:1, but
3239                // diverge when the eviction pop-loop drops blocks
3240                // of a different size than the incoming input. The
3241                // `dictionary_retained_budget` retire decision
3242                // downstream then gets driven by inflated eviction
3243                // counts and shrinks `max_window_size` prematurely.
3244                //
3245                // Derive the real eviction delta from `window_size`
3246                // before/after the call. The pop loop inside
3247                // `add_data` decrements `window_size` by each
3248                // evicted block length and then the final
3249                // `extend_from_slice + push_back` adds `space_len`,
3250                // so `evicted = pre + space_len - post`.
3251                let pre = m.window_size;
3252                let space_len = space.len();
3253                m.add_data(space, |data| {
3254                    // Same per-block recycle as the HashChain arm: push
3255                    // the spent input buffer back as-is rather than
3256                    // zero-filling to capacity. `add_data` mirrors the
3257                    // bytes into `history` and calls this every block, so
3258                    // capacity-wide zeroing would be hot-path waste;
3259                    // `get_next_space` zeroes at most `slice_size` bytes
3260                    // when it later reuses the buffer.
3261                    vec_pool.push(data);
3262                });
3263                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3264                // block are byte counts bounded by the window, no overflow.
3265                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3266            }
3267            MatcherStorage::Row(m) => {
3268                // RowMatchGenerator::add_data recycles the *input* buffer
3269                // through this callback every commit (its bytes are mirrored
3270                // into `history`), not the evicted chunks. Derive the eviction
3271                // delta from `window_size` before/after — `evicted = pre +
3272                // space_len - post` — exactly like the Simple / HashChain arms.
3273                // Counting the callback argument as evicted would charge the
3274                // whole committed block as evicted and prematurely retire
3275                // dictionary budget on a window that evicts nothing.
3276                let pre = m.window_size;
3277                let space_len = space.len();
3278                m.add_data(space, |data| {
3279                    // Recycle the spent buffer as-is; `add_data` runs this for
3280                    // every committed block, so zero-filling to capacity here
3281                    // would be hot-path waste (`get_next_space` zeroes at most
3282                    // `slice_size` on reuse).
3283                    vec_pool.push(data);
3284                });
3285                // Plain `+` (the `saturating_sub` floors at 0): `pre` + one
3286                // block are byte counts bounded by the window, no overflow.
3287                evicted_bytes += (pre + space_len).saturating_sub(m.window_size);
3288            }
3289            MatcherStorage::HashChain(m) => {
3290                // MatchTable::add_data now recycles the *incoming* buffer
3291                // through `reuse_space` (its bytes are copied into the
3292                // contiguous `history` mirror), so the callback no longer
3293                // reports evicted chunks. Derive the eviction delta from
3294                // `window_size` before/after, exactly like the Simple arm:
3295                // `evicted = pre + space_len - post`.
3296                let pre = m.table.window_size;
3297                let space_len = space.len();
3298                m.table.add_data(space, |data| {
3299                    // Recycle the spent input buffer to the pool as-is.
3300                    // `add_data` runs this callback for every committed
3301                    // block (the bytes are mirrored into `history`), so
3302                    // growing the buffer to its full capacity here would
3303                    // zero the whole allocation on the hot path.
3304                    // `get_next_space` resizes a popped buffer to
3305                    // `slice_size` on demand, touching at most
3306                    // `slice_size` bytes — never the larger capacity the
3307                    // pool retains.
3308                    vec_pool.push(data);
3309                });
3310                // Plain `+` (the `saturating_sub` floors at 0): byte counts
3311                // bounded by the window, no overflow.
3312                evicted_bytes += (pre + space_len).saturating_sub(m.table.window_size);
3313            }
3314        }
3315        // Gate the second backend trim pass on actual budget
3316        // reclamation. Without it, every slice commit on the
3317        // no-dictionary / no-eviction path (the common case) would
3318        // run a backend `match` ladder + `trim_to_window` early-out
3319        // for no reason — `trim_after_budget_retire` only does
3320        // meaningful work when `retire_dictionary_budget` shrank
3321        // `max_window_size` enough to make the backend's
3322        // `window_size > max_window_size` invariant trigger
3323        // eviction.
3324        if self.retire_dictionary_budget(evicted_bytes) {
3325            self.trim_after_budget_retire();
3326        }
3327    }
3328
3329    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
3330        use super::strategy::{self, StrategyTag};
3331        // Borrowed one-shot Fast path: if the frame driver staged a
3332        // block range via `set_borrowed_block`, scan it in place against
3333        // the borrowed window instead of the owned committed block. Only
3334        // the Simple backend is instrumented (the gate guarantees it),
3335        // and the stage is consumed so the next block re-stages.
3336        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3337            match self.active_backend() {
3338                super::strategy::BackendTag::Simple => {
3339                    let m = self.simple_mut();
3340                    if m.dict_is_attached() {
3341                        // Dict-attach borrowed scan: live matches read the
3342                        // borrowed input in place, dict matches read the
3343                        // committed dict prefix via the 2-segment counter.
3344                        m.start_matching_borrowed_dict(
3345                            block_start,
3346                            block_end,
3347                            &mut handle_sequence,
3348                        );
3349                    } else {
3350                        m.start_matching_borrowed(block_start, block_end, &mut handle_sequence);
3351                    }
3352                }
3353                super::strategy::BackendTag::Dfast => self
3354                    .dfast_matcher_mut()
3355                    .start_matching_borrowed(block_start, block_end, &mut handle_sequence),
3356                super::strategy::BackendTag::Row => {
3357                    // Same greedy/lazy parse split as the owned RowHash arm.
3358                    let greedy = self.parse == super::strategy::ParseMode::Greedy;
3359                    self.row_matcher_mut().start_matching_borrowed(
3360                        block_start,
3361                        block_end,
3362                        greedy,
3363                        &mut handle_sequence,
3364                    );
3365                }
3366                super::strategy::BackendTag::HashChain => match self.search {
3367                    super::strategy::SearchMethod::HashChain => self
3368                        .hc_matcher_mut()
3369                        .start_matching_lazy_borrowed(block_start, block_end, &mut handle_sequence),
3370                    super::strategy::SearchMethod::BinaryTree => {
3371                        // Run the SAME BT dispatch as the owned BinaryTree arm
3372                        // below — every BT body reads its range via
3373                        // current_block_range() and bytes via live_history()
3374                        // (borrowed-aware), so the staged block is scanned in
3375                        // place. The table was already staged by
3376                        // `set_borrowed_block` (the HashChain arm at the top of
3377                        // this file calls `table.stage_borrowed_block` with the
3378                        // same range, and `borrowed_pending` is set only there),
3379                        // so no re-stage is needed here.
3380                        // Only btlazy2 reaches the borrowed BinaryTree scan:
3381                        // `borrowed_supported()` keeps the optimal parsers
3382                        // (BtOpt/BtUltra/BtUltra2) on the owned path, and
3383                        // `set_borrowed_block` asserts that predicate before any
3384                        // range is staged, so an optimal strategy_tag can never
3385                        // arrive here.
3386                        match self.strategy_tag {
3387                            StrategyTag::Btlazy2 => self
3388                                .hc_matcher_mut()
3389                                .start_matching_btlazy2(&mut handle_sequence),
3390                            other => unreachable!(
3391                                "borrowed BinaryTree scan is only supported for Btlazy2, got {other:?}"
3392                            ),
3393                        }
3394                    }
3395                    other => {
3396                        unreachable!("HashChain backend with unexpected search {other:?}")
3397                    }
3398                },
3399            }
3400            return;
3401        }
3402        // Decoupled parse×search dispatch (fires once per block). The
3403        // search axis (`self.search`) picks the candidate-finding backend;
3404        // the parse axis (greedy vs lazy depth) is carried by the
3405        // backend's runtime `lazy_depth`, set per level at `reset()`.
3406        // The two are independent, so any parse can run on any search
3407        // backend. The `BinaryTree` arm still selects the opt `Strategy`
3408        // ZST off `strategy_tag` so `compress_block::<S>` keeps its
3409        // const-folded optimal-parser monomorphisation.
3410        use super::strategy::SearchMethod;
3411        match self.search {
3412            SearchMethod::Fast => {
3413                self.simple_mut().start_matching(&mut handle_sequence);
3414                self.recycle_simple_space();
3415            }
3416            SearchMethod::DoubleFast => {
3417                self.dfast_matcher_mut()
3418                    .start_matching(&mut handle_sequence);
3419            }
3420            SearchMethod::RowHash => {
3421                // Greedy parse (depth 0) = upstream zstd-greedy entry (default
3422                // `ip + 1` start, greedy repcode commit); lazy / lazy2 use
3423                // the `pick_lazy_match` lookahead entry (reads `lazy_depth`).
3424                // Both bare entries dispatch on `row_log` internally into the
3425                // const-`ROW_LOG` hot loop (upstream zstd per-rowLog variant table).
3426                let greedy = self.parse == super::strategy::ParseMode::Greedy;
3427                let row = self.row_matcher_mut();
3428                if greedy {
3429                    row.start_matching_greedy(&mut handle_sequence);
3430                } else {
3431                    row.start_matching(&mut handle_sequence);
3432                }
3433            }
3434            SearchMethod::HashChain => {
3435                // Greedy/lazy/lazy2 all flow through the lazy parser; it
3436                // reads `hc.lazy_depth` (0 = greedy commit).
3437                self.hc_matcher_mut()
3438                    .start_matching_lazy(&mut handle_sequence);
3439            }
3440            SearchMethod::BinaryTree => match self.strategy_tag {
3441                StrategyTag::Btlazy2 => self
3442                    .hc_matcher_mut()
3443                    .start_matching_btlazy2(&mut handle_sequence),
3444                StrategyTag::BtOpt => self.compress_block::<strategy::BtOpt>(&mut handle_sequence),
3445                StrategyTag::BtUltra => {
3446                    self.compress_block::<strategy::BtUltra>(&mut handle_sequence)
3447                }
3448                StrategyTag::BtUltra2 => {
3449                    self.compress_block::<strategy::BtUltra2>(&mut handle_sequence)
3450                }
3451                _ => unreachable!(
3452                    "SearchMethod::BinaryTree requires a BT strategy tag (Btlazy2/BtOpt/BtUltra/BtUltra2)"
3453                ),
3454            },
3455        }
3456    }
3457
3458    fn skip_matching(&mut self) {
3459        self.skip_matching_with_hint(None);
3460    }
3461
3462    fn skip_matching_with_hint(&mut self, incompressible_hint: Option<bool>) {
3463        // Borrowed one-shot Fast path: a staged block range routes to the
3464        // borrowed skip (records the range for `get_last_space`, primes
3465        // hashes on the dict-priming hint) with no owned-history append
3466        // and nothing to recycle. Stage is consumed.
3467        if let Some((block_start, block_end)) = self.borrowed_pending.take() {
3468            match self.active_backend() {
3469                super::strategy::BackendTag::Simple => self.simple_mut().skip_matching_borrowed(
3470                    block_start,
3471                    block_end,
3472                    incompressible_hint,
3473                ),
3474                super::strategy::BackendTag::Dfast => self
3475                    .dfast_matcher_mut()
3476                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3477                super::strategy::BackendTag::Row => self.row_matcher_mut().skip_matching_borrowed(
3478                    block_start,
3479                    block_end,
3480                    incompressible_hint,
3481                ),
3482                super::strategy::BackendTag::HashChain => self
3483                    .hc_matcher_mut()
3484                    .skip_matching_borrowed(block_start, block_end, incompressible_hint),
3485            }
3486            return;
3487        }
3488        match self.active_backend() {
3489            super::strategy::BackendTag::Simple => {
3490                self.simple_mut()
3491                    .skip_matching_with_hint(incompressible_hint);
3492                self.recycle_simple_space();
3493            }
3494            super::strategy::BackendTag::Dfast => {
3495                self.dfast_matcher_mut().skip_matching(incompressible_hint)
3496            }
3497            super::strategy::BackendTag::Row => self
3498                .row_matcher_mut()
3499                .skip_matching_with_hint(incompressible_hint),
3500            super::strategy::BackendTag::HashChain => {
3501                self.hc_matcher_mut().skip_matching(incompressible_hint)
3502            }
3503        }
3504    }
3505}
3506
3507impl MatchGeneratorDriver {
3508    /// Monomorphised optimal-parser entry point. Only the `BinaryTree`
3509    /// search arm of [`Matcher::start_matching`] routes here, selecting
3510    /// the concrete opt `S: Strategy` (BtOpt / BtUltra / BtUltra2) off
3511    /// `strategy_tag`, so the optimiser keeps the cost-model predicates
3512    /// (`S::USE_BT` / `S::USE_HASH3` / `S::ACCURATE_PRICE` /
3513    /// `S::TWO_PASS_SEED`) const-folded per strategy. The non-opt search
3514    /// backends (Fast / DoubleFast / RowHash / HashChain) are dispatched
3515    /// directly off the search axis and never reach this method, so all
3516    /// strategies arriving here are HashChain-backed.
3517    fn compress_block<S: super::strategy::Strategy>(
3518        &mut self,
3519        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
3520    ) {
3521        debug_assert_eq!(S::BACKEND, super::strategy::BackendTag::HashChain);
3522        debug_assert!(
3523            S::USE_BT,
3524            "compress_block only handles the optimal (BT) path"
3525        );
3526        self.hc_matcher_mut()
3527            .start_matching_strategy::<S>(handle_sequence);
3528    }
3529}
3530
3531/// Stage D: backend storage discriminator.
3532///
3533/// HC (lazy / lazy2) modes carry no extra per-frame state beyond the
3534/// shared `MatchTable` and `HcMatcher` runtime knobs, so the
3535/// [`HcBackend::Hc`] variant is zero-sized — no BT scratch is
3536/// allocated. BT-flavoured modes (`btopt` / `btultra` / `btultra2`)
3537/// hold the full [`super::bt::BtMatcher`] inside the
3538/// [`HcBackend::Bt`] variant (cost model, optimal-parser scratch
3539/// arenas, LDM candidate buffer).
3540///
3541/// The discriminator lives next to `parse_mode` so `configure()` can
3542/// promote between the two on a level change without touching the
3543/// `MatchTable` storage.
3544#[derive(Clone)]
3545pub(crate) enum HcBackend {
3546    /// Lazy / lazy2 modes — no per-frame backend state.
3547    Hc,
3548    /// BT-driven modes — owns the optimal parser's per-frame scratch.
3549    /// Boxed so the enum stays pointer-sized: HC-only matchers pay
3550    /// just the `Box`-niche, not the 4 KiB `BtMatcher` payload.
3551    Bt(alloc::boxed::Box<super::bt::BtMatcher>),
3552}
3553
3554impl HcBackend {
3555    /// Heap bytes held by the backend. `Hc` is zero-sized; `Bt` boxes a
3556    /// `BtMatcher`, so count the boxed payload plus its own scratch heap.
3557    fn heap_size(&self) -> usize {
3558        match self {
3559            Self::Hc => 0,
3560            Self::Bt(bt) => core::mem::size_of::<super::bt::BtMatcher>() + bt.heap_size(),
3561        }
3562    }
3563
3564    /// Mutable accessor on the BT matcher; panics if the active
3565    /// backend is `Hc`. The HC-or-Bt branches in orchestrator code use
3566    /// `let HcBackend::Bt(bt) = &self.backend` directly for readonly
3567    /// access — this helper exists so macro bodies that already drive
3568    /// a mutable BT update through the optimal parser can write
3569    /// `$self.backend.bt_mut().X` without an outer `match` ladder.
3570    #[inline(always)]
3571    pub(crate) fn bt_mut(&mut self) -> &mut super::bt::BtMatcher {
3572        match self {
3573            Self::Bt(bt) => bt,
3574            Self::Hc => unreachable!("BT-only accessor called in HC mode"),
3575        }
3576    }
3577}
3578
3579#[derive(Clone)]
3580struct HcMatchGenerator {
3581    /// Shared match-finder storage (window, history, hash / chain /
3582    /// hash3 tables, dictionary-priming flags). Used identically by HC
3583    /// and BT modes; backend-specific table interpretation lives in the
3584    /// matcher methods on this struct.
3585    table: super::match_table::storage::MatchTable,
3586    /// HC runtime knobs (lazy_depth, search_depth, target_len). Always
3587    /// present — BT modes still consult `hc.search_depth` for repcode
3588    /// probing and chain candidate enumeration.
3589    hc: super::hc::HcMatcher,
3590    /// Backend discriminator. [`HcBackend::Hc`] is zero-sized for the
3591    /// lazy / lazy2 path so HC-only generators don't carry the BT
3592    /// optimal-parser scratch buffers. [`HcBackend::Bt`] holds the
3593    /// `BtMatcher` when an optimal mode is configured.
3594    backend: HcBackend,
3595    /// Compile-time strategy tag mirrored from
3596    /// [`MatchGeneratorDriver::strategy_tag`] during `configure()`.
3597    /// The driver hot path never reads this — it dispatches to
3598    /// `compress_block::<S>` from its own tag — but the
3599    /// `#[cfg(test)] start_matching` helper consumes it so artificial
3600    /// test setups still pick the correct concrete `S` for the
3601    /// const-generic optimal parser (BtOpt vs BtUltra vs BtUltra2).
3602    /// Without this field the test path would have to collapse
3603    /// `BtOpt` and `BtUltra` onto the same monomorphisation since
3604    /// `table.uses_bt` / `table.is_btultra2` alone can't tell them
3605    /// apart.
3606    strategy_tag: super::strategy::StrategyTag,
3607}
3608
3609// Plain-data types relocated to [`crate::encoding::opt::types`] and
3610// [`crate::encoding::opt::ldm`] by #111 Phase 1. The use statements at
3611// the top of this file bring them back into scope so the existing
3612// methods on `HcMatchGenerator` compile unchanged.
3613
/// `bt_insert_step_no_rebase` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Each kernel-specific wrapper invokes
/// the macro with its own `fastpath::<kernel>::count_match_from_indices`
/// path so the call resolves inside the wrapper's `#[target_feature]`
/// umbrella and inlines instead of paying the function-call ABI per BT walk
/// iteration. Used only by `HcMatchGenerator` BT walk wrappers below.
///
/// Crate-private: the macro body references private `encoding::*`
/// modules via `$crate::...`, so it is unusable downstream and is
/// re-exported only inside this crate via `pub(crate) use` below.
///
/// # Parameters
/// * `$table` — place expression for the match table. It is substituted
///   textually and therefore evaluated many times; pass a plain field path
///   with no side effects. The body reads and writes its `hash_table` and
///   `chain_table`.
/// * `$search_depth` — initial compare budget: the walk visits at most this
///   many candidates.
/// * `$abs_pos` — absolute position being inserted into the tree.
/// * `$current_abs_end` — absolute end of the current block; match lengths
///   are capped at `$current_abs_end - $abs_pos`.
/// * `$target_abs` — forwarded to `window_low_abs_for_target` to obtain the
///   lowest admissible candidate position.
/// * `$cmf` — path of the match-length counter, invoked inside `unsafe` as
///   `$cmf(concat, idx, candidate_idx, tail_limit, seed_len)`.
///
/// # Expansion value
/// The macro expands to a block yielding a `usize` of at least 1
/// (presumably the number of positions the caller advances by — confirm at
/// the wrappers). It also contains early `return 1;` statements, so it must
/// be expanded in a function whose return type accepts that literal: they
/// fire when fewer than 8 bytes remain at the position or when
/// `relative_position` yields `None`.
macro_rules! bt_insert_step_no_rebase_body {
    ($table:expr, $search_depth:expr, $abs_pos:ident, $current_abs_end:ident, $target_abs:ident, $cmf:path) => {{
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        // SAFETY: pointer and length come from the live slice itself; this
        // relies on the backing storage not being reallocated or freed while
        // the walk runs (the body below only writes the hash / chain tables).
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        if idx + 8 > concat.len() {
            return 1;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT walker called past current block end"
        );
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults and performs no
            // Rust-visible access. The address is formed with `wrapping_add`,
            // which (unlike `add`) has no in-bounds precondition, so this is
            // sound even before the bounds-checked `hash_table[hash]` below
            // has validated `hash`.
            unsafe {
                _mm_prefetch(
                    $table.hash_table.as_ptr().wrapping_add(hash).cast(),
                    _MM_HINT_T0,
                );
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: prefetch never faults, and `wrapping_add` places no
                // in-bounds requirement on the computed address, so even an
                // out-of-range index would only produce a harmless no-op
                // hint. (`hash_next` is never bounds-checked in this body.)
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().wrapping_add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return 1;
        };
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // `abs_pos < bt_mask` legitimately happens for the first BT walk of
        // a fresh frame (bt_low effectively "no floor"). Saturating keeps
        // the floor at 0 so the `candidate_abs <= bt_low` check never
        // triggers early; raw subtraction would underflow into a huge
        // sentinel that ALWAYS triggers.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        // Hoist the BT pointer-pair base out of `self` once — see the
        // collect-matches body for the full rationale (per-step Vec reload +
        // bounds check through `&mut self` vs the upstream zstd's raw `U32*` walk).
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        let window_low = $table.window_low_abs_for_target($target_abs);
        // `abs_pos + 9` is safe in raw form: `MatchTable::add_data` caps
        // total input at `usize::MAX - STREAM_ABS_HEADROOM` (where
        // `STREAM_ABS_HEADROOM = HC_OPT_NUM + 16`), so every
        // frame-lifetime absolute cursor passed to the BT walker stays
        // below `usize::MAX - 9` regardless of stream length or
        // pointer width. The guard is hoisted to the data-ingest
        // boundary so this per-position site pays zero arithmetic
        // overhead in the hot loop.
        let mut match_end_abs = $abs_pos + 9;
        let mut best_len = 8usize;
        let mut compares_left = $search_depth;
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // Read the previous bucket head (first candidate of the walk), then
        // publish the current position as the new head.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;

        while compares_left > 0 {
            if match_stored == $crate::encoding::match_table::storage::HC_EMPTY {
                break;
            }
            // Reject stale post-rebase slots whose pre-shift position is below
            // `index_shift` explicitly. A `wrapping_sub` maps such a slot to a
            // near-`usize::MAX` value that the `>= abs_pos` test only rejects
            // while `abs_pos` is far from the integer ceiling; on a
            // long-running rebased stream (reachable on 32-bit) `abs_pos` can
            // approach the ceiling and the wrapped value can land back inside
            // `[window_low, abs_pos)`. `checked_sub` ends the walk on the
            // underflow instead. `match_stored != HC_EMPTY` here, so the `- 1`
            // cannot underflow.
            let Some(candidate_abs) = ($table.position_base + (match_stored as usize - 1))
                .checked_sub($table.index_shift)
            else {
                break;
            };
            if candidate_abs < window_low || candidate_abs >= $abs_pos {
                break;
            }
            compares_left -= 1;

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            // Both bounds already share this many leading bytes with the
            // current position, so the comparison can start from there.
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()` since the candidate is within
            // `[history_abs_start, abs_pos)` and `tail_limit ≤
            // current_abs_end - abs_pos`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            if match_len > best_len {
                best_len = match_len;
                // `candidate_abs + match_len <= current_abs_end` by BT walk
                // invariant — `match_len <= tail_limit = current_abs_end -
                // abs_pos` and `candidate_abs < abs_pos`.
                let candidate_end = candidate_abs + match_len;
                if candidate_end > match_end_abs {
                    match_end_abs = candidate_end;
                }
            }

            // The match reaches the block end: there is no differing byte
            // left to order the two suffixes by, so the walk stops here.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // Terminate whichever subtree links are still open.
        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        let speed_positions = if best_len > 384 {
            (best_len - 384).min(192)
        } else {
            0
        };
        // `match_end_abs` is initialized to `abs_pos + 9` and is only
        // reassigned inside the `candidate_end > match_end_abs` branch
        // above. So even though an individual `candidate_end =
        // candidate_abs + match_len` can land below `abs_pos` (the
        // candidate sits earlier in history and the match runs short),
        // the variable itself never drops below its initial value.
        // That gives `match_end_abs ≥ abs_pos + 9 > abs_pos + 8` as a
        // loop-wide invariant, so the raw subtraction below cannot
        // underflow.
        speed_positions.max(match_end_abs - ($abs_pos + 8))
    }};
}
pub(crate) use bt_insert_step_no_rebase_body;
3844
// NOTE(review): the paragraph below documents the `build_optimal_plan_impl`
// body macro, not the `btlazy2_offbase` function that follows it. It is kept
// as a plain `//` comment so rustdoc does not attach it to that function;
// move it next to the macro it describes.
//
// `build_optimal_plan_impl` body parameterized over the per-CPU
// `collect_optimal_candidates_initialized_<kernel>` method name. Caller
// passes its `&mut self`, the seven DP entry-point arguments, and the
// kernel-specific collect method. Each per-kernel wrapper invokes this
// macro inside its own `#[target_feature]` umbrella so the per-position
// `$collect` call inlines and the entire DP loop runs as one straight-line
// hot path without an ABI barrier between the DP and the match-gathering
// pipeline.
//
// Body is ~730 lines but mechanically identical across kernels — the macro
// keeps a single source of truth. The two const generics
// (`ACCURATE_PRICE`, `FAVOR_SMALL_OFFSETS`) come from the wrapper's
// generic parameter list and are referenced as bare identifiers; macro
// hygiene resolves them at the expansion site.
/// Upstream zstd `offBase` as used by the btlazy2 lazy gain heuristic.
///
/// An offset equal to one of the three active repeat offsets is priced as
/// the cheap repcode code `1`, `2` or `3`; every other offset is priced as
/// `offset + 3`. An equal-length repeat-offset match therefore always
/// out-gains an explicit-offset one (`zstd_lazy.c` `ZSTD_storeSeq` offBase
/// convention).
///
/// `ll0` marks a zero-literal position, where upstream shifts the repcode
/// mapping: the cheap codes become rep1 / rep2 / (rep0 - 1) instead of
/// rep0 / rep1 / rep2, because an offset equal to rep0 is then the special
/// rep0-1 case rather than repcode 1. Scoring against the unshifted set at
/// `ll0` would over-reward a rep0-distance match that does not actually
/// encode as the cheapest code.
///
/// When several slots hold the same value the lowest code wins.
#[inline]
fn btlazy2_offbase(offset: usize, reps: [u32; 3], ll0: bool) -> u32 {
    let off = offset as u32;
    // Index of the first cheap slot equal to `off`, if any.
    let cheap_slot = if ll0 {
        // `rep0 - 1` only exists as a code when `rep0 > 1`; the closure keeps
        // the subtraction from being evaluated otherwise.
        let shifted = [
            Some(reps[1]),
            Some(reps[2]),
            (reps[0] > 1).then(|| reps[0] - 1),
        ];
        shifted.iter().position(|&slot| slot == Some(off))
    } else {
        reps.iter().position(|&rep| rep == off)
    };
    match cheap_slot {
        Some(rank) => rank as u32 + 1,
        // Offsets are < window (<= 2^27), so `+ 3` never overflows u32.
        None => off + 3,
    }
}
3894
3895/// Upstream zstd lazy match gain (`matchLength * 4 - ZSTD_highbit32(offBase)`): the
3896/// selection metric that lets a shorter repeat-offset match beat a longer
3897/// explicit-offset one. `offBase >= 1`, so `highbit` is well-defined.
3898#[inline]
3899fn btlazy2_gain(match_len: usize, offset: usize, reps: [u32; 3], ll0: bool) -> i64 {
3900    let offbase = btlazy2_offbase(offset, reps, ll0);
3901    (match_len as i64) * 4 - (31 - offbase.leading_zeros()) as i64
3902}
3903
3904/// Per-kernel body of the `btlazy2` (levels 13-15) greedy/lazy parse over
3905/// the binary-tree match finder. Mirrors `build_optimal_plan_impl_body!`'s
3906/// kernel-dispatch discipline: the wrapper carries the `#[target_feature]`
3907/// umbrella and passes its tier-specific `collect_optimal_candidates_initialized_<kernel>`
3908/// as `$collect`, so the per-position BT collect (and its inlined cpl)
3909/// stays under one umbrella — the runtime `select_kernel()` dispatch happens
3910/// ONCE per block in the bare `start_matching_btlazy2`, never per position.
3911macro_rules! start_matching_btlazy2_body {
3912    ($self:ident, $handle_sequence:ident, $collect:ident, $cmf:path $(,)?) => {{
3913        $self.table.ensure_tables();
3914        // Borrowed-aware: owned → last committed chunk; borrowed → staged block.
3915        let (current_abs_start, current_len) = $self.table.current_block_range();
3916        if current_len == 0 {
3917            return;
3918        }
3919        let current_ptr = $self.table.get_last_space().as_ptr();
3920        // Mutates tables but never reallocates `history`, so this tail slice
3921        // stays valid for the routine's duration (same as the other parsers).
3922        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
3923        // Full contiguous live region (owned: dict + prior blocks + current
3924        // block in `history`; borrowed: `[0, block_end)` of the in-place
3925        // input) as a raw slice, for the explicit repcode probe: a rep offset
3926        // can point before the current block, which `current` can't reach.
3927        // `live_history()` is borrowed-aware; reborrow-then-raw-ptr so the
3928        // slice holds NO borrow and coexists with the `&mut self` collector
3929        // calls below. Same no-realloc validity contract as `current`.
3930        let history_abs_start = $self.table.history_abs_start;
3931        let concat_full: &[u8] = unsafe {
3932            let lh = $self.table.live_history();
3933            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
3934        };
3935        let current_abs_end = current_abs_start + current_len;
3936        $self
3937            .table
3938            .apply_limited_update_after_long_match(current_abs_start);
3939        $self
3940            .table
3941            .backfill_boundary_positions(current_abs_start, current_abs_end);
3942
3943        let profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::Btlazy2>();
3944        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);
3945
3946        let depth = $self.hc.lazy_depth as usize;
3947        let mut pos = 0usize;
3948        let mut literals_start = 0usize;
3949
3950        // Collect + select the highest-GAIN match at a position (upstream zstd
3951        // `ZSTD_searchMax` plus the explicit offset_1 repcode check): scan the
3952        // length-sorted BT/dms ladder by gain, then probe rep0 directly since
3953        // the ladder's strictly-increasing-length filter drops short cheap
3954        // reps. Expands to `(match_len, offset)`; `match_len == 0` = no match.
3955        macro_rules! bt_select {
3956            ($p:expr) => {{
3957                let sel_pos: usize = $p;
3958                // `ll0` (upstream zstd): zero literals pending before this position, so
3959                // the repcode set is shifted (see `btlazy2_offbase`).
3960                let ll0 = sel_pos == literals_start;
3961                let sel_abs = current_abs_start + sel_pos;
3962                candidates.clear();
3963                let query = HcCandidateQuery {
3964                    reps: $self.table.offset_hist,
3965                    lit_len: sel_pos - literals_start,
3966                    // No LDM seed: L13-15 run at windowLog 22, below upstream zstd's
3967                    // LDM auto-enable threshold (windowLog >= 27).
3968                    ldm_candidate: None,
3969                };
3970                // SAFETY: called inside the wrapper's `#[target_feature]`
3971                // umbrella (the scalar wrapper's `$collect` is a safe fn).
3972                unsafe {
3973                    $self.$collect::<super::strategy::Btlazy2, true>(
3974                        sel_abs,
3975                        current_abs_end,
3976                        profile,
3977                        query,
3978                        &mut candidates,
3979                    );
3980                }
3981                let reps = $self.table.offset_hist;
3982                let mut sel_ml = 0usize;
3983                let mut sel_off = 0usize;
3984                let mut sel_gain = i64::MIN;
3985                for c in candidates.iter() {
3986                    let ml = c.match_len.min(current_len - sel_pos);
3987                    if ml < HC_OPT_MIN_MATCH_LEN {
3988                        continue;
3989                    }
3990                    let g = btlazy2_gain(ml, c.offset, reps, ll0);
3991                    if g > sel_gain {
3992                        sel_gain = g;
3993                        sel_ml = ml;
3994                        sel_off = c.offset;
3995                    }
3996                }
3997                let sel_idx = sel_abs - history_abs_start;
3998                // Upstream zstd probes `rep[0 + ll0]` directly (the length-sorted ladder
3999                // drops short cheap reps): rep0 normally, rep1 at a zero-literal
4000                // position where rep0 is not the cheapest code.
4001                let probe_rep = if ll0 {
4002                    reps[1] as usize
4003                } else {
4004                    reps[0] as usize
4005                };
4006                if probe_rep != 0 && sel_idx >= probe_rep {
4007                    let tail = current_len - sel_pos;
4008                    // SAFETY: `sel_idx - probe_rep < sel_idx`, `sel_idx + tail <=
4009                    // concat_full.len()`; same overshoot slack the collector
4010                    // relies on for this block.
4011                    let rep_ml =
4012                        unsafe { $cmf(concat_full, sel_idx, sel_idx - probe_rep, tail, 0) };
4013                    if rep_ml >= HC_OPT_MIN_MATCH_LEN
4014                        && btlazy2_gain(rep_ml, probe_rep, reps, ll0) > sel_gain
4015                    {
4016                        sel_ml = rep_ml;
4017                        sel_off = probe_rep;
4018                    }
4019                }
4020                (sel_ml, sel_off)
4021            }};
4022        }
4023
4024        while pos + HC_OPT_MIN_MATCH_LEN <= current_len {
4025            let (mut best_ml, mut best_off) = bt_select!(pos);
4026            if best_ml < HC_OPT_MIN_MATCH_LEN {
4027                pos += 1;
4028                continue;
4029            }
4030            // Lazy lookahead (upstream zstd depth 1/2): advance one byte and accept the
4031            // later match only if it out-gains the current one by the upstream zstd
4032            // margin (deferring costs an extra literal — `+4` at depth 1, `+7`
4033            // at depth 2). `start` tracks where the chosen match begins.
4034            let mut start = pos;
4035            let mut d = 0usize;
4036            while d < depth && start + 1 + HC_OPT_MIN_MATCH_LEN <= current_len {
4037                let look = start + 1;
4038                let (ml2, off2) = bt_select!(look);
4039                if ml2 < HC_OPT_MIN_MATCH_LEN {
4040                    break;
4041                }
4042                let reps = $self.table.offset_hist;
4043                let margin = if d == 0 { 4 } else { 7 };
4044                // `best` sits at `start` (ll0 iff no literals precede it); the
4045                // lookahead match at `start + 1` always has a pending literal.
4046                let gain1 = btlazy2_gain(best_ml, best_off, reps, start == literals_start) + margin;
4047                let gain2 = btlazy2_gain(ml2, off2, reps, false);
4048                if gain2 > gain1 {
4049                    best_ml = ml2;
4050                    best_off = off2;
4051                    start = look;
4052                    d += 1;
4053                } else {
4054                    break;
4055                }
4056            }
4057            // Commit the chosen match at `start`; [literals_start, start) is
4058            // emitted as literals. `best_ml` was bounded to `current_len -
4059            // start` at selection, so `start + best_ml <= current_len`.
4060            let lit_len = start - literals_start;
4061            let literals = &current[literals_start..start];
4062            $handle_sequence(Sequence::Triple {
4063                literals,
4064                offset: best_off,
4065                match_len: best_ml,
4066            });
4067            let _ = encode_offset_with_history(
4068                best_off as u32,
4069                lit_len as u32,
4070                &mut $self.table.offset_hist,
4071            );
4072            pos = start + best_ml;
4073            literals_start = pos;
4074        }
4075
4076        if literals_start < current_len {
4077            $handle_sequence(Sequence::Literals {
4078                literals: &current[literals_start..],
4079            });
4080        }
4081        $self.backend.bt_mut().opt_candidates_scratch = candidates;
4082    }};
4083}
4084
/// 8-lane `next_cost < node_price` mask for the optimal-parser price-set
/// loop. AVX2 lacks an unsigned `cmplt`, so `nc < np` is derived from
/// `min_epu32`: `nc <= np` iff `min(nc, np) == nc`, then equality is excluded.
/// Returns a bitmask (bit `k` set => lane `k` improves). Compiled on every
/// x86 target (same as the avx2 collect kernel); the cargo `kernel_avx2`
/// feature only gates the runtime dispatch, not compilation.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2 and that `node_price` holds
/// at least 8 elements; the length is only checked in debug builds, and a
/// shorter slice makes the 256-bit load read out of bounds.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_improved_mask8_avx2(next_cost: &[u32; 8], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_andnot_si256, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256,
        _mm256_min_epu32, _mm256_movemask_ps,
    };
    // Same guard the deinterleave helpers use for their vector loads.
    debug_assert!(node_price.len() >= 8);
    // SAFETY: `next_cost` is exactly 8 u32 (32 bytes); the load is unaligned.
    let nc = unsafe { _mm256_loadu_si256(next_cost.as_ptr() as *const __m256i) };
    // SAFETY: the caller guarantees `node_price` spans at least 8 u32.
    let np = unsafe { _mm256_loadu_si256(node_price.as_ptr() as *const __m256i) };
    let min = _mm256_min_epu32(nc, np);
    let le = _mm256_cmpeq_epi32(min, nc); // nc <= np
    let eq = _mm256_cmpeq_epi32(nc, np); // nc == np
    let lt = _mm256_andnot_si256(eq, le); // nc < np
    // One sign bit per 32-bit lane -> 8-bit mask.
    _mm256_movemask_ps(_mm256_castsi256_ps(lt)) as u8
}
4117
4118/// Inline `next_cost = base_cost + ll0_price + match_price_from_parts(off,ml)`
4119/// for one match length — the exact `add_prices` chain the scalar loop uses,
4120/// so the SoA vector path stays byte-identical.
4121#[inline(always)]
4122#[allow(clippy::too_many_arguments)]
4123fn priceset_next_cost(
4124    profile: HcOptimalCostProfile,
4125    stats: &HcOptState,
4126    ml_cache: &mut [[u32; 2]],
4127    ml_stamp: u32,
4128    match_len: usize,
4129    ll0_price: u32,
4130    off_price: u32,
4131    base_cost: u32,
4132) -> u32 {
4133    let ml_price =
4134        BtMatcher::cached_match_length_price(profile, stats, match_len, ml_cache, ml_stamp);
4135    let seq_cost = BtMatcher::add_prices(
4136        ll0_price,
4137        profile.match_price_from_parts(off_price, ml_price, stats),
4138    );
4139    BtMatcher::add_prices(base_cost, seq_cost)
4140}
4141
4142/// Scalar price-set over the match-length range `[start, max]` for the
4143/// NON-abort optimal modes (btultra / btultra2). Each `match_len` writes a
4144/// distinct node `pos + match_len`, so order is irrelevant; the improvement
4145/// test reduces to `next_cost < node_prices[next]` (`reset_opt_nodes` set
4146/// every beyond-frontier cell to `u32::MAX`, subsuming `next > last_pos`).
4147/// `#[inline]` so it folds into each per-tier optimal-parser monomorphisation
4148/// (no call overhead). Returns the highest written `next`.
4149#[inline]
4150#[allow(clippy::too_many_arguments)]
4151// Used by the scalar / sse42 DP wrappers; on aarch64 the dispatch only reaches
4152// the neon wrapper and on wasm+simd128 only the simd128 wrapper, so this is
4153// cfg-dead on those targets.
4154#[cfg_attr(
4155    any(
4156        all(target_arch = "aarch64", target_endian = "little"),
4157        all(target_arch = "wasm32", target_feature = "simd128")
4158    ),
4159    allow(dead_code)
4160)]
4161fn priceset_range_nonabort_scalar(
4162    node_prices: &mut [u32],
4163    nodes: &mut [HcOptimalNode],
4164    ml_cache: &mut [[u32; 2]],
4165    ml_stamp: u32,
4166    profile: HcOptimalCostProfile,
4167    stats: &HcOptState,
4168    pos: usize,
4169    start: usize,
4170    max: usize,
4171    ll0_price: u32,
4172    off_price: u32,
4173    base_cost: u32,
4174    off: u32,
4175    reps: [u32; 3],
4176    last_pos: usize,
4177) -> usize {
4178    let mut new_last = last_pos;
4179    for ml in start..=max {
4180        let next_cost = priceset_next_cost(
4181            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4182        );
4183        let next = pos + ml;
4184        if next_cost < node_prices[next] {
4185            node_prices[next] = next_cost;
4186            nodes[next] = HcOptimalNode {
4187                off,
4188                mlen: ml as u32,
4189                litlen: 0,
4190                reps,
4191            };
4192            if next > new_last {
4193                new_last = next;
4194            }
4195        }
4196    }
4197    new_last
4198}
4199
/// Checks each tier's deinterleave and improve-mask helper against a scalar
/// reference. Runtime dispatch only ever selects one tier per machine (i9
/// picks AVX2 over SSE4.1, M1 picks NEON), so the tiers that lose the dispatch
/// never execute in the roundtrip suite; here the helpers are called directly
/// on whatever ISA the test host exposes (AVX2 + SSE4.1 on x86, NEON on
/// aarch64).
#[cfg(test)]
#[test]
fn priceset_tier_helpers_match_scalar() {
    // Reference deinterleave: `W` contiguous generation-stamped cells yield
    // their prices in order, but only when every cell carries `stamp`.
    fn scalar_deint<const W: usize>(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; W]> {
        let mut prices = [0u32; W];
        for (slot, &[price, generation]) in prices.iter_mut().zip(&cells[..W]) {
            if generation != stamp {
                return None;
            }
            *slot = price;
        }
        Some(prices)
    }
    // Reference mask: bit `lane` is set when `nc[lane] < np[lane]`.
    fn scalar_mask<const W: usize>(nc: &[u32; W], np: &[u32]) -> u8 {
        let mut bits = 0u8;
        for (lane, (cost, price)) in nc.iter().zip(&np[..W]).enumerate() {
            if cost < price {
                bits |= 1u8 << lane;
            }
        }
        bits
    }
    const S: u32 = 0x55;
    let warm: [[u32; 2]; 4] = [[11, S], [22, S], [33, S], [44, S]];
    // A single stale generation must turn the whole chunk into `None`.
    let mut cold = warm;
    cold[2][1] = S ^ 1;
    let nc4: [u32; 4] = [10, 99, 30, 41];
    // Strictly-less lanes: 0 (10 < 20) and 3 (41 < 99); lane 2 is a tie.
    let np4: [u32; 4] = [20, 21, 30, 99];

    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    unsafe {
        assert_eq!(
            priceset_cached_prices4_neon(&warm, S),
            scalar_deint::<4>(&warm, S)
        );
        assert_eq!(priceset_cached_prices4_neon(&cold, S), None);
        assert_eq!(
            priceset_improved_mask4_neon(&nc4, &np4),
            scalar_mask::<4>(&nc4, &np4)
        );
    }
    #[cfg(all(feature = "std", any(target_arch = "x86", target_arch = "x86_64")))]
    {
        if std::is_x86_feature_detected!("sse4.2") {
            unsafe {
                assert_eq!(
                    priceset_cached_prices4_sse41(&warm, S),
                    scalar_deint::<4>(&warm, S)
                );
                assert_eq!(priceset_cached_prices4_sse41(&cold, S), None);
                assert_eq!(
                    priceset_improved_mask4_sse41(&nc4, &np4),
                    scalar_mask::<4>(&nc4, &np4)
                );
            }
        }
        if std::is_x86_feature_detected!("avx2") {
            let warm8: [[u32; 2]; 8] = [
                [11, S],
                [22, S],
                [33, S],
                [44, S],
                [55, S],
                [66, S],
                [77, S],
                [88, S],
            ];
            // Stale cell in the upper half of the 8-cell chunk.
            let mut cold8 = warm8;
            cold8[5][1] = S ^ 1;
            let nc8: [u32; 8] = [10, 99, 30, 41, 99, 60, 99, 80];
            let np8: [u32; 8] = [20, 21, 30, 99, 50, 99, 70, 99];
            unsafe {
                assert_eq!(
                    priceset_cached_prices8_avx2(&warm8, S),
                    scalar_deint::<8>(&warm8, S)
                );
                assert_eq!(priceset_cached_prices8_avx2(&cold8, S), None);
                assert_eq!(
                    priceset_improved_mask8_avx2(&nc8, &np8),
                    scalar_mask::<8>(&nc8, &np8)
                );
            }
        }
    }
}
4291
4292/// Shared vectorised price-set loop body, generic over the SIMD width `W`.
4293/// The per-tier `deint` (vector-load plus deinterleave of `W` cached prices,
4294/// returning `Some` only on an all-warm chunk) and `mask` (per-tier
4295/// `next_cost` less-than `node_price` bitmask) are passed as zero-sized
4296/// `impl Fn`s. `#[inline(always)]` plus monomorphisation folds `deint` and
4297/// `mask` directly into each per-tier wrapper's `target_feature` umbrella, so
4298/// the intrinsics inline with no call ABI and no runtime feature detection.
4299/// Cold or out-of-cache chunks, and the sub-`W` remainder, fall back to the
4300/// scalar `priceset_next_cost` (which fills the cache); writes are
4301/// scalar-scatter on the improving lanes (1-8% of compares, per the
4302/// improve-ratio probe). Same signature tail as the scalar variant.
4303#[inline(always)]
4304#[allow(clippy::too_many_arguments)]
4305// Instantiated only by a vector tier wrapper (avx2/sse4.1 on x86, neon on
4306// aarch64, simd128 on wasm+simd128); a target with none of those (e.g.
4307// wasm without +simd128) uses only the scalar range, leaving this generic dead.
4308#[cfg_attr(
4309    not(any(
4310        target_arch = "x86",
4311        target_arch = "x86_64",
4312        all(target_arch = "aarch64", target_endian = "little"),
4313        all(target_arch = "wasm32", target_feature = "simd128")
4314    )),
4315    allow(dead_code)
4316)]
4317fn priceset_range_vec<const W: usize>(
4318    node_prices: &mut [u32],
4319    nodes: &mut [HcOptimalNode],
4320    ml_cache: &mut [[u32; 2]],
4321    ml_stamp: u32,
4322    profile: HcOptimalCostProfile,
4323    stats: &HcOptState,
4324    pos: usize,
4325    start: usize,
4326    max: usize,
4327    ll0_price: u32,
4328    off_price: u32,
4329    base_cost: u32,
4330    off: u32,
4331    reps: [u32; 3],
4332    last_pos: usize,
4333    deint: impl Fn(&[[u32; 2]], u32) -> Option<[u32; W]>,
4334    mask: impl Fn(&[u32; W], &[u32]) -> u8,
4335) -> usize {
4336    let mut new_last = last_pos;
4337    let mut buf = [0u32; W];
4338    // Loop-invariant constant of the byte-identical next_cost chain:
4339    // next_cost = add_prices(base_cost, add_prices(ll0_price,
4340    //   match_price_from_parts(off_price, ml_price))) = c_base + ml_price,
4341    // c_base = base_cost + ll0_price + match_price_from_parts(off_price, 0).
4342    //
4343    // This stays bit-exact with the scalar `priceset_next_cost` because both
4344    // helpers are affine in `ml_price`: `BtMatcher::add_prices(a, b) = a + b`
4345    // and `match_price_from_parts(off, ml) = off + ml + bias` are plain integer
4346    // additions, so `match_price_from_parts(off, ml) = match_price_from_parts(
4347    // off, 0) + ml` and the whole chain collapses to `c_base + ml_price`. The
4348    // `wrapping_add` here matches the scalar `+` under the cost model's
4349    // no-overflow invariant (the `debug_assert`s in both helpers). Factoring the
4350    // combine into one helper per the review suggestion would force a per-lane
4351    // `match_price_from_parts(off, ml_price)` recompute instead of hoisting the
4352    // ml-independent `c_base` once — a regression on this hot DP loop — so the
4353    // hoist is kept and the equivalence documented here instead.
4354    let c_base = base_cost
4355        .wrapping_add(ll0_price)
4356        .wrapping_add(profile.match_price_from_parts(off_price, 0, stats));
4357    let mut ml = start;
4358    while ml + W <= max + 1 {
4359        let vectorised = if ml + W <= ml_cache.len() {
4360            deint(&ml_cache[ml..ml + W], ml_stamp)
4361        } else {
4362            None
4363        };
4364        if let Some(prices) = vectorised {
4365            for (k, slot) in buf.iter_mut().enumerate() {
4366                *slot = c_base.wrapping_add(prices[k]);
4367            }
4368        } else {
4369            for (k, slot) in buf.iter_mut().enumerate() {
4370                *slot = priceset_next_cost(
4371                    profile,
4372                    stats,
4373                    ml_cache,
4374                    ml_stamp,
4375                    ml + k,
4376                    ll0_price,
4377                    off_price,
4378                    base_cost,
4379                );
4380            }
4381        }
4382        let base_next = pos + ml;
4383        let mut bits = mask(&buf, &node_prices[base_next..base_next + W]);
4384        while bits != 0 {
4385            let k = bits.trailing_zeros() as usize;
4386            bits &= bits - 1;
4387            let next = base_next + k;
4388            node_prices[next] = buf[k];
4389            nodes[next] = HcOptimalNode {
4390                off,
4391                mlen: (ml + k) as u32,
4392                litlen: 0,
4393                reps,
4394            };
4395            if next > new_last {
4396                new_last = next;
4397            }
4398        }
4399        ml += W;
4400    }
4401    while ml <= max {
4402        let next_cost = priceset_next_cost(
4403            profile, stats, ml_cache, ml_stamp, ml, ll0_price, off_price, base_cost,
4404        );
4405        let next = pos + ml;
4406        if next_cost < node_prices[next] {
4407            node_prices[next] = next_cost;
4408            nodes[next] = HcOptimalNode {
4409                off,
4410                mlen: ml as u32,
4411                litlen: 0,
4412                reps,
4413            };
4414            if next > new_last {
4415                new_last = next;
4416            }
4417        }
4418        ml += 1;
4419    }
4420    new_last
4421}
4422
/// Loads 8 cached ml-prices for the optimal parser's price-set from a run of
/// 8 contiguous `[price, generation]` cells.
///
/// Yields `Some(prices)` only when ALL eight cells are warm
/// (`generation == stamp`) — the common (~91-98%) case — so the caller can
/// fold them with one broadcast constant. Any cold cell yields `None`, which
/// routes the chunk through the scalar fill (which recomputes + repopulates
/// the misses). The deinterleave uses cheap in-128-lane ops (`shuffle_epi32`
/// + `unpack*_epi64`) plus a single cross-lane `permute4x64` for the ordered
/// prices, avoiding the latency-bound chain of cross-lane `permutevar8x32`s
/// that lost to pipelined scalar loads on high-chunk-count fixtures.
///
/// # Safety
///
/// The CPU must support AVX2 and `cells` must hold at least 8 entries (only
/// checked in debug builds).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn priceset_cached_prices8_avx2(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 8]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m256i, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256, _mm256_movemask_ps,
        _mm256_permute4x64_epi64, _mm256_set1_epi32, _mm256_shuffle_epi32, _mm256_storeu_si256,
        _mm256_unpackhi_epi64, _mm256_unpacklo_epi64,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m256i, _mm256_castsi256_ps, _mm256_cmpeq_epi32, _mm256_loadu_si256, _mm256_movemask_ps,
        _mm256_permute4x64_epi64, _mm256_set1_epi32, _mm256_shuffle_epi32, _mm256_storeu_si256,
        _mm256_unpackhi_epi64, _mm256_unpacklo_epi64,
    };
    debug_assert!(cells.len() >= 8);
    let src = cells.as_ptr().cast::<__m256i>();
    // SAFETY: 8 cells are 64 contiguous bytes, i.e. two unaligned 256-bit loads.
    //   lower = [p0 g0 p1 g1 | p2 g2 p3 g3]
    //   upper = [p4 g4 p5 g5 | p6 g6 p7 g7]
    let (lower, upper) = unsafe { (_mm256_loadu_si256(src), _mm256_loadu_si256(src.add(1))) };
    // Within each 128-bit lane, 0xD8 regroups [p g p g] into [p p g g].
    let lower_grouped = _mm256_shuffle_epi32::<0xD8>(lower); // [p0 p1 g0 g1 | p2 p3 g2 g3]
    let upper_grouped = _mm256_shuffle_epi32::<0xD8>(upper); // [p4 p5 g4 g5 | p6 p7 g6 g7]
    // High 64 bits of every lane are generations; their order is irrelevant
    // for an all-equal test.
    let generations = _mm256_unpackhi_epi64(lower_grouped, upper_grouped);
    let warm = _mm256_cmpeq_epi32(generations, _mm256_set1_epi32(stamp as i32));
    let all_warm = _mm256_movemask_ps(_mm256_castsi256_ps(warm)) as u8 == 0xFF;
    if !all_warm {
        return None;
    }
    // Low 64 bits of every lane are prices: [p0 p1 p4 p5 | p2 p3 p6 p7]. As
    // 64-bit chunks that is [c0 c1 c2 c3]; 0xD8 picks [c0 c2 c1 c3], which
    // restores [p0..p7].
    let scrambled = _mm256_unpacklo_epi64(lower_grouped, upper_grouped);
    let ordered = _mm256_permute4x64_epi64::<0xD8>(scrambled);
    let mut prices = [0u32; 8];
    // SAFETY: `prices` is 32 writable bytes; the store is unaligned.
    unsafe { _mm256_storeu_si256(prices.as_mut_ptr().cast::<__m256i>(), ordered) };
    Some(prices)
}
4472
4473#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4474#[target_feature(enable = "avx2")]
4475#[inline]
4476#[allow(clippy::too_many_arguments)]
4477unsafe fn priceset_range_nonabort_avx2(
4478    node_prices: &mut [u32],
4479    nodes: &mut [HcOptimalNode],
4480    ml_cache: &mut [[u32; 2]],
4481    ml_stamp: u32,
4482    profile: HcOptimalCostProfile,
4483    stats: &HcOptState,
4484    pos: usize,
4485    start: usize,
4486    max: usize,
4487    ll0_price: u32,
4488    off_price: u32,
4489    base_cost: u32,
4490    off: u32,
4491    reps: [u32; 3],
4492    last_pos: usize,
4493) -> usize {
4494    priceset_range_vec::<8>(
4495        node_prices,
4496        nodes,
4497        ml_cache,
4498        ml_stamp,
4499        profile,
4500        stats,
4501        pos,
4502        start,
4503        max,
4504        ll0_price,
4505        off_price,
4506        base_cost,
4507        off,
4508        reps,
4509        last_pos,
4510        // SAFETY: both closures run inside this fn's avx2 target_feature umbrella.
4511        |cells, stamp| unsafe { priceset_cached_prices8_avx2(cells, stamp) },
4512        |nc, np| unsafe { priceset_improved_mask8_avx2(nc, np) },
4513    )
4514}
4515
/// NEON 4-lane load + deinterleave of cached ml-prices. `vld2q_u32` splits
/// the 4 contiguous `[price, generation]` pairs natively into two registers
/// (prices, generations), so no shuffle chain is needed. Yields
/// `Some(prices)` only when all 4 generations equal `stamp`, i.e. when
/// `vminvq` over the equality mask is all-ones.
///
/// # Safety
///
/// `cells` must hold at least 4 entries (only checked in debug builds).
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_cached_prices4_neon(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::aarch64::{vceqq_u32, vdupq_n_u32, vld2q_u32, vminvq_u32, vst1q_u32};
    debug_assert!(cells.len() >= 4);
    // SAFETY: caller's neon umbrella; `cells` is >= 4 pairs = 8 contiguous u32.
    let lanes = unsafe { vld2q_u32(cells.as_ptr().cast::<u32>()) };
    // `lanes.0` = prices, `lanes.1` = generations.
    let warm = vceqq_u32(lanes.1, vdupq_n_u32(stamp));
    // The minimum over the mask is all-ones only if every lane matched.
    if vminvq_u32(warm) == u32::MAX {
        let mut prices = [0u32; 4];
        // SAFETY: `prices` has room for the 4 stored u32.
        unsafe { vst1q_u32(prices.as_mut_ptr(), lanes.0) };
        Some(prices)
    } else {
        None
    }
}
4536
/// NEON 4-lane `next_cost < node_price` bitmask. NEON has an unsigned compare
/// (`vcltq_u32`) but no movemask, so the all-ones lane mask is ANDed with the
/// lane weights `[1, 2, 4, 8]` and horizontally added (`vaddvq_u32`) to pack
/// the 4 bits. Bit `k` of the result is set when lane `k` improves.
///
/// # Safety
///
/// `node_price` must hold at least 4 elements; the length is only checked in
/// debug builds, and a shorter slice makes the 128-bit load read out of
/// bounds.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
unsafe fn priceset_improved_mask4_neon(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::aarch64::{vaddvq_u32, vandq_u32, vcltq_u32, vld1q_u32};
    // Per-lane bit weight used to pack the compare result into an integer.
    const LANE_WEIGHTS: [u32; 4] = [1, 2, 4, 8];
    // Same guard the deinterleave helpers use for their vector loads.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: neon umbrella; `next_cost` and `LANE_WEIGHTS` are exactly 4 u32
    // and the caller guarantees `node_price` spans at least 4 u32.
    let nc = unsafe { vld1q_u32(next_cost.as_ptr()) };
    let np = unsafe { vld1q_u32(node_price.as_ptr()) };
    let weights = unsafe { vld1q_u32(LANE_WEIGHTS.as_ptr()) };
    let lt = vcltq_u32(nc, np); // all-ones where nc < np
    // The weights are distinct powers of two, so the sum is the bitwise OR.
    vaddvq_u32(vandq_u32(lt, weights)) as u8
}
4555
/// NEON tier of the non-abort price-set: runs `priceset_range_vec` at 4 lanes
/// with `priceset_cached_prices4_neon` as the deinterleave step and
/// `priceset_improved_mask4_neon` as the improve mask. All other arguments
/// and the return value are forwarded unchanged.
///
/// # Safety
///
/// The CPU must support NEON.
#[cfg(all(target_arch = "aarch64", target_endian = "little"))]
#[target_feature(enable = "neon")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_neon(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    // SAFETY: both closures run inside this fn's neon target_feature umbrella.
    let load_warm_prices =
        |cells: &[[u32; 2]], stamp: u32| unsafe { priceset_cached_prices4_neon(cells, stamp) };
    let improved_lanes = |next_cost: &[u32; 4], node_price: &[u32]| unsafe {
        priceset_improved_mask4_neon(next_cost, node_price)
    };
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        load_warm_prices,
        improved_lanes,
    )
}
4598
/// SSE4.1 4-lane load + deinterleave of cached ml-prices. Two 128-bit loads
/// fetch the `[price, gen]` pairs, `shuffle_epi32(0xD8)` regroups each into
/// prices-then-gens, and `unpacklo/hi_epi64` separates the two halves.
/// Yields `Some(prices)` only when all 4 generations equal `stamp`.
///
/// # Safety
///
/// The CPU must support SSE4.2 (the feature this fn is compiled under) and
/// `cells` must hold at least 4 entries (only checked in debug builds).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_cached_prices4_sse41(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128, _mm_movemask_ps,
        _mm_set1_epi32, _mm_shuffle_epi32, _mm_storeu_si128, _mm_unpackhi_epi64,
        _mm_unpacklo_epi64,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128, _mm_movemask_ps,
        _mm_set1_epi32, _mm_shuffle_epi32, _mm_storeu_si128, _mm_unpackhi_epi64,
        _mm_unpacklo_epi64,
    };
    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr().cast::<__m128i>();
    // SAFETY: 4 cells are 32 contiguous bytes, i.e. two unaligned 128-bit loads.
    //   first  = [p0 g0 p1 g1]
    //   second = [p2 g2 p3 g3]
    let (first, second) = unsafe { (_mm_loadu_si128(src), _mm_loadu_si128(src.add(1))) };
    let first_grouped = _mm_shuffle_epi32::<0xD8>(first); // [p0 p1 g0 g1]
    let second_grouped = _mm_shuffle_epi32::<0xD8>(second); // [p2 p3 g2 g3]
    let generations = _mm_unpackhi_epi64(first_grouped, second_grouped); // [g0 g1 g2 g3]
    let warm = _mm_cmpeq_epi32(generations, _mm_set1_epi32(stamp as i32));
    let warm_bits = _mm_movemask_ps(_mm_castsi128_ps(warm)) as u8 & 0x0F;
    if warm_bits != 0x0F {
        return None;
    }
    let ordered = _mm_unpacklo_epi64(first_grouped, second_grouped); // [p0 p1 p2 p3]
    let mut prices = [0u32; 4];
    // SAFETY: `prices` is 16 writable bytes; the store is unaligned.
    unsafe { _mm_storeu_si128(prices.as_mut_ptr().cast::<__m128i>(), ordered) };
    Some(prices)
}
4635
/// SSE4.1 4-lane `next_cost < node_price` bitmask. As in the AVX2 path the
/// unsigned compare is built from `min_epu32`: `nc <= np` iff
/// `min(nc, np) == nc`, then equal lanes are removed. Bit `k` of the result
/// is set when lane `k` improves; only the low 4 bits are ever set.
///
/// # Safety
///
/// The CPU must support SSE4.2 (the feature this fn is compiled under) and
/// `node_price` must hold at least 4 elements; the length is only checked in
/// debug builds, and a shorter slice makes the 128-bit load read out of
/// bounds.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn priceset_improved_mask4_sse41(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::{
        __m128i, _mm_andnot_si128, _mm_castsi128_ps, _mm_cmpeq_epi32, _mm_loadu_si128,
        _mm_min_epu32, _mm_movemask_ps,
    };
    // Same guard the deinterleave helpers use for their vector loads.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: `next_cost` is exactly 4 u32 (16 bytes); the load is unaligned.
    let nc = unsafe { _mm_loadu_si128(next_cost.as_ptr() as *const __m128i) };
    // SAFETY: the caller guarantees `node_price` spans at least 4 u32.
    let np = unsafe { _mm_loadu_si128(node_price.as_ptr() as *const __m128i) };
    let min = _mm_min_epu32(nc, np);
    let le = _mm_cmpeq_epi32(min, nc); // nc <= np
    let eq = _mm_cmpeq_epi32(nc, np); // nc == np
    let lt = _mm_andnot_si128(eq, le); // nc < np
    // One sign bit per 32-bit lane -> 4-bit mask.
    (_mm_movemask_ps(_mm_castsi128_ps(lt)) as u8) & 0x0F
}
4660
4661#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
4662#[target_feature(enable = "sse4.2")]
4663#[inline]
4664#[allow(clippy::too_many_arguments)]
4665unsafe fn priceset_range_nonabort_sse41(
4666    node_prices: &mut [u32],
4667    nodes: &mut [HcOptimalNode],
4668    ml_cache: &mut [[u32; 2]],
4669    ml_stamp: u32,
4670    profile: HcOptimalCostProfile,
4671    stats: &HcOptState,
4672    pos: usize,
4673    start: usize,
4674    max: usize,
4675    ll0_price: u32,
4676    off_price: u32,
4677    base_cost: u32,
4678    off: u32,
4679    reps: [u32; 3],
4680    last_pos: usize,
4681) -> usize {
4682    priceset_range_vec::<4>(
4683        node_prices,
4684        nodes,
4685        ml_cache,
4686        ml_stamp,
4687        profile,
4688        stats,
4689        pos,
4690        start,
4691        max,
4692        ll0_price,
4693        off_price,
4694        base_cost,
4695        off,
4696        reps,
4697        last_pos,
4698        // SAFETY: both closures run inside this fn's sse4.2 target_feature umbrella.
4699        |cells, stamp| unsafe { priceset_cached_prices4_sse41(cells, stamp) },
4700        |nc, np| unsafe { priceset_improved_mask4_sse41(nc, np) },
4701    )
4702}
4703
/// wasm `simd128` 4-lane load + deinterleave of cached ml-prices.
/// `u32x4_shuffle` picks the price (even) and generation (odd) lanes across
/// the two loaded vectors natively. Yields `Some(prices)` only when all 4
/// generations equal `stamp` (`u32x4_all_true` of the equality vector).
///
/// # Safety
///
/// `cells` must hold at least 4 entries (only checked in debug builds).
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_cached_prices4_simd128(cells: &[[u32; 2]], stamp: u32) -> Option<[u32; 4]> {
    use core::arch::wasm32::{
        u32x4_all_true, u32x4_eq, u32x4_shuffle, u32x4_splat, v128, v128_load, v128_store,
    };
    debug_assert!(cells.len() >= 4);
    let src = cells.as_ptr() as *const v128;
    // SAFETY: 4 cells are 32 contiguous bytes, i.e. two 128-bit loads.
    //   first  = [p0 g0 p1 g1]
    //   second = [p2 g2 p3 g3]
    let (first, second) = unsafe { (v128_load(src), v128_load(src.add(1))) };
    // Shuffle indices 0..3 select from `first`, 4..7 from `second`.
    let generations = u32x4_shuffle::<1, 3, 5, 7>(first, second); // [g0 g1 g2 g3]
    let warm = u32x4_eq(generations, u32x4_splat(stamp));
    if u32x4_all_true(warm) {
        let ordered = u32x4_shuffle::<0, 2, 4, 6>(first, second); // [p0 p1 p2 p3]
        let mut prices = [0u32; 4];
        // SAFETY: `prices` is 16 writable bytes.
        unsafe { v128_store(prices.as_mut_ptr() as *mut v128, ordered) };
        Some(prices)
    } else {
        None
    }
}
4730
/// wasm `simd128` 4-lane `next_cost < node_price` bitmask.
///
/// Bit `i` of the result is set when `next_cost[i] < node_price[i]`. wasm has
/// a native unsigned compare (`u32x4_lt`) and `u32x4_bitmask` to pack the
/// lanes, so no sign-bias trick is needed. Only the first four entries of
/// `node_price` are read.
///
/// # Safety
///
/// `node_price` must contain at least 4 elements, because a full 16-byte
/// vector is loaded from its start. This is checked only by a `debug_assert!`,
/// mirroring the length check in `priceset_cached_prices4_simd128`.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
unsafe fn priceset_improved_mask4_simd128(next_cost: &[u32; 4], node_price: &[u32]) -> u8 {
    use core::arch::wasm32::{u32x4_bitmask, u32x4_lt, v128, v128_load};
    // A slice carries no static length, unlike `next_cost`; catch short
    // inputs in debug builds before the unchecked vector load below.
    debug_assert!(node_price.len() >= 4);
    // SAFETY: `next_cost` is exactly four `u32`s (16 bytes); `v128_load` has
    // no alignment requirement.
    let nc = unsafe { v128_load(next_cost.as_ptr() as *const v128) };
    // SAFETY: the caller guarantees `node_price` holds at least four `u32`s
    // (asserted above in debug builds); `v128_load` has no alignment
    // requirement.
    let np = unsafe { v128_load(node_price.as_ptr() as *const v128) };
    u32x4_bitmask(u32x4_lt(nc, np))
}
4742
/// wasm `simd128` entry point for the 4-lane price-set range kernel.
///
/// Passes every argument through unchanged to `priceset_range_vec::<4>` and
/// supplies the two `simd128` lane helpers as its callbacks:
/// `priceset_cached_prices4_simd128` for the cached-price probe and
/// `priceset_improved_mask4_simd128` for the improvement bitmask. The return
/// value is whatever `priceset_range_vec` returns.
///
/// # Safety
///
/// NOTE(review): the preconditions are those of `priceset_range_vec` and the
/// two lane helpers it is given (each helper performs unchecked 16-byte vector
/// loads); `priceset_range_vec` is not visible here, so confirm against it.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[target_feature(enable = "simd128")]
#[inline]
#[allow(clippy::too_many_arguments)]
unsafe fn priceset_range_nonabort_simd128(
    node_prices: &mut [u32],
    nodes: &mut [HcOptimalNode],
    ml_cache: &mut [[u32; 2]],
    ml_stamp: u32,
    profile: HcOptimalCostProfile,
    stats: &HcOptState,
    pos: usize,
    start: usize,
    max: usize,
    ll0_price: u32,
    off_price: u32,
    base_cost: u32,
    off: u32,
    reps: [u32; 3],
    last_pos: usize,
) -> usize {
    priceset_range_vec::<4>(
        node_prices,
        nodes,
        ml_cache,
        ml_stamp,
        profile,
        stats,
        pos,
        start,
        max,
        ll0_price,
        off_price,
        base_cost,
        off,
        reps,
        last_pos,
        // SAFETY: each callback is invoked from within this function's
        // `simd128` target_feature scope, which is what the helpers require.
        |pairs, generation| unsafe { priceset_cached_prices4_simd128(pairs, generation) },
        |costs, prices| unsafe { priceset_improved_mask4_simd128(costs, prices) },
    )
}
4785
4786macro_rules! build_optimal_plan_impl_body {
4787    (
4788        $self:expr,
4789        $strategy_ty:ty,
4790        $current:ident,
4791        $current_abs_start:ident,
4792        $current_len:ident,
4793        $initial_state:ident,
4794        $stats:ident,
4795        $out:ident,
4796        $collect:ident,
4797        $priceset:path $(,)?
4798    ) => {{
4799        let current_abs_end = $current_abs_start + $current_len;
4800        let min_match_len = HC_OPT_MIN_MATCH_LEN;
4801        // `HC_OPT_NUM > 0` by const definition, so `HC_OPT_NUM - 1` is safe.
4802        let frontier_limit = $current_len.min(HC_OPT_NUM - 1);
4803        let initial_reps = $initial_state.reps;
4804        let initial_litlen = $initial_state.litlen;
4805        let ldm_block_offset = $initial_state.block_offset;
4806        let mut profile = $initial_state.profile;
4807        profile.sufficient_match_len = $self.hc.sufficient_match_len_for_pass(profile);
4808        // Const-fold from the strategy's associated `OPT_LEVEL`
4809        // (upstream zstd `optLevel`): BtOpt = 0, BtUltra / BtUltra2 = 2.
4810        // The two flags below are the only places the inner DP loop
4811        // used to consult `parse_mode`; lifting them into const
4812        // expressions drops one indirect read + one branch on every
4813        // candidate insertion and every traceback step.
4814        // `let` (not `const`) — nested `const` items inside a
4815        // generic fn cannot project through the outer fn's type
4816        // parameter, but a `let` binding from a const expression
4817        // does get folded by the optimiser per monomorphisation,
4818        // which is what we actually want here.
4819        debug_assert!(
4820            <$strategy_ty as super::strategy::Strategy>::USE_BT,
4821            "build_optimal_plan_impl_body called on non-BT strategy"
4822        );
4823        let abort_on_worse_match: bool =
4824            <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL == 0;
4825        let opt_level: bool = <$strategy_ty as super::strategy::Strategy>::OPT_LEVEL >= 2;
4826        let mut nodes = core::mem::take(&mut $self.backend.bt_mut().opt_nodes_scratch);
4827        let mut node_prices = core::mem::take(&mut $self.backend.bt_mut().opt_node_prices_scratch);
4828        // `frontier_limit + 2 <= HC_OPT_NODE_LEN` — bounded by const.
4829        let frontier_buffer_size = frontier_limit + 2;
4830        if nodes.len() < HC_OPT_NODE_LEN {
4831            // First optimal-parse use (empty boxed slice) or an undersized
4832            // buffer: allocate the fixed upstream-zstd-sized frontier once. The DP
4833            // overwrites the active prefix before reading it.
4834            nodes = alloc::vec![HcOptimalNode::default(); HC_OPT_NODE_LEN].into_boxed_slice();
4835        }
4836        // The DP price array, same fixed length as `nodes`. This is the SOLE
4837        // home of each position's price (the node struct carries no price), so
4838        // the SIMD price-set vector-loads it directly. Initialised to u32::MAX
4839        // so unwritten frontier cells compare as "unreachable".
4840        if node_prices.len() < HC_OPT_NODE_LEN {
4841            node_prices = alloc::vec![u32::MAX; HC_OPT_NODE_LEN].into_boxed_slice();
4842        }
4843        let mut candidates = core::mem::take(&mut $self.backend.bt_mut().opt_candidates_scratch);
4844        candidates.clear();
4845        if candidates.capacity() < MAX_HC_SEARCH_DEPTH {
4846            candidates.reserve_exact(MAX_HC_SEARCH_DEPTH - candidates.capacity());
4847        }
4848        let mut store = core::mem::take(&mut $self.backend.bt_mut().opt_store_scratch);
4849        store.clear();
4850        let mut price_arena = core::mem::take(&mut $self.backend.bt_mut().opt_price_arena);
4851        if price_arena.len() < HC_OPT_PRICE_ARENA_LEN {
4852            price_arena = alloc::vec![[0u32; 2]; HC_OPT_PRICE_ARENA_LEN].into_boxed_slice();
4853        }
4854        // Single arena → two disjoint fixed-stride regions of `[price,
4855        // generation]` pairs (LL cache, ML cache): one base pointer + fixed
4856        // offsets, mirroring upstream zstd's single opt workspace. Pairing
4857        // price+generation per code keeps the optimal parser's cache probe
4858        // on ONE line instead of two strided regions.
4859        // SAFETY: `price_arena` is exactly `HC_OPT_PRICE_ARENA_LEN =
4860        // 2 * HC_OPT_PRICE_STRIDE` pairs long (just ensured), so the two
4861        // STRIDE-wide regions are in bounds and disjoint. The slices alias
4862        // the heap buffer `price_arena` owns; that heap address is stable
4863        // across the later move of the `price_arena` box into the result
4864        // bundle (a `Box` move relocates only the pointer, not the heap
4865        // data), and the slices are never used after the bundle is
4866        // constructed. The fixed STRIDE (independent of `frontier_limit`)
4867        // keeps every code's cell at a constant offset so the monotonic
4868        // stamps stay valid across calls with different frontiers.
4869        let arena_base = price_arena.as_mut_ptr();
4870        let mut ll_cache: &mut [[u32; 2]] =
4871            unsafe { core::slice::from_raw_parts_mut(arena_base, HC_OPT_PRICE_STRIDE) };
4872        let mut ml_cache: &mut [[u32; 2]] = unsafe {
4873            core::slice::from_raw_parts_mut(arena_base.add(HC_OPT_PRICE_STRIDE), HC_OPT_PRICE_STRIDE)
4874        };
4875        $self.backend.bt_mut().opt_ll_price_stamp = $self
4876            .backend
4877            .bt_mut()
4878            .opt_ll_price_stamp
4879            .wrapping_add(1)
4880            .max(1);
4881        let ll_price_stamp = $self.backend.bt_mut().opt_ll_price_stamp;
4882        $self.backend.bt_mut().opt_lit_price_stamp = $self
4883            .backend
4884            .bt_mut()
4885            .opt_lit_price_stamp
4886            .wrapping_add(1)
4887            .max(1);
4888        let lit_price_stamp = $self.backend.bt_mut().opt_lit_price_stamp;
4889        $self.backend.bt_mut().opt_ml_price_stamp = $self
4890            .backend
4891            .bt_mut()
4892            .opt_ml_price_stamp
4893            .wrapping_add(1)
4894            .max(1);
4895        let ml_price_stamp = $self.backend.bt_mut().opt_ml_price_stamp;
4896        let node0_price = BtMatcher::cached_lit_length_price(
4897            profile,
4898            $stats,
4899            initial_litlen,
4900            &mut ll_cache,
4901            ll_price_stamp,
4902        );
4903        nodes[0] = HcOptimalNode {
4904            litlen: initial_litlen as u32,
4905            reps: initial_reps,
4906            ..HcOptimalNode::default()
4907        };
4908        node_prices[0] = node0_price;
4909        let sufficient_len = profile.sufficient_match_len;
4910        let ll0_price = BtMatcher::cached_lit_length_price(
4911            profile,
4912            $stats,
4913            0,
4914            &mut ll_cache,
4915            ll_price_stamp,
4916        );
4917        let ll1_price = BtMatcher::cached_lit_length_price(
4918            profile,
4919            $stats,
4920            1,
4921            &mut ll_cache,
4922            ll_price_stamp,
4923        );
4924        let mut pos = 1usize;
4925        let mut last_pos = 0usize;
4926        let mut forced_end: Option<usize> = None;
4927        let mut forced_end_state: Option<HcOptimalNode> = None;
4928        // Price companion of `forced_end_state` (price no longer lives in the
4929        // node struct; tracked alongside the forced-seed node).
4930        let mut forced_end_price: Option<u32> = None;
4931        let mut seed_forced_shortest_path = false;
4932        let mut opt_ldm = HcOptLdmState {
4933            seq_store: HcRawSeqStore {
4934                pos: 0,
4935                pos_in_sequence: 0,
4936                size: $self.backend.bt_mut().ldm_sequences.len(),
4937            },
4938            ..HcOptLdmState::default()
4939        };
4940        let has_ldm = !$self.backend.bt_mut().ldm_sequences.is_empty();
4941        if has_ldm {
4942            // `ldm_sequences` are emitted in BLOCK-relative coordinates,
4943            // but this optimal-parser pass runs over a SEGMENT of the
4944            // block starting at block-offset `$block_offset` and uses
4945            // segment-relative positions throughout. Fast-forward the raw
4946            // seq-store cursor past the bytes covered by earlier segments
4947            // so the (segment-relative) LDM windows below land at the
4948            // correct positions. Idempotent: `ldm_skip_raw_seq_store_bytes`
4949            // recomputes from `pos = 0`, so re-running it per segment is
4950            // safe. Without this, every segment after the first re-applied
4951            // the block's leading LDM windows at the wrong offset, emitting
4952            // matches that copy the wrong bytes (undecodable frame).
4953            if ldm_block_offset > 0 {
4954                $self
4955                    .backend
4956                    .bt_mut()
4957                    .ldm_skip_raw_seq_store_bytes(&mut opt_ldm.seq_store, ldm_block_offset);
4958            }
4959            $self
4960                .backend
4961                .bt_mut()
4962                .ldm_get_next_match_and_update_seq_store(&mut opt_ldm, 0, $current_len);
4963        }
4964
4965        // Upstream zstd-like seed at rPos=0: initialize frontier with matches starting
4966        // at current position before entering the generic forward DP loop.
4967        if $current_len >= min_match_len {
4968            let seed_ldm = if has_ldm {
4969                $self.backend.bt_mut().ldm_process_match_candidate(
4970                    &mut opt_ldm,
4971                    0,
4972                    $current_len,
4973                    min_match_len,
4974                )
4975            } else {
4976                None
4977            };
4978            candidates.clear();
4979            // SAFETY: wrapper is in the same target_feature umbrella as the
4980            // `$collect` kernel variant; the runtime kernel detector already
4981            // gated entry into the wrapper.
4982            unsafe {
4983                $self.$collect::<$strategy_ty, true>(
4984                    $current_abs_start,
4985                    current_abs_end,
4986                    profile,
4987                    HcCandidateQuery {
4988                        reps: initial_reps,
4989                        lit_len: initial_litlen,
4990                        ldm_candidate: seed_ldm,
4991                    },
4992                    &mut candidates,
4993                )
4994            };
4995            if !candidates.is_empty() {
4996                // `min_match_len >= HC_FORMAT_MINMATCH (3)` by invariant.
4997                last_pos = (min_match_len - 1).min(frontier_limit);
4998                for p in 1..min_match_len.min(frontier_buffer_size) {
4999                    BtMatcher::reset_opt_node(&mut nodes[p]);
5000                    // Reset the price (sole home; the node carries none).
5001                    node_prices[p] = u32::MAX;
5002                    // `initial_litlen` is the litlen carried from prior
5003                    // optimal-plan segments — its real bound is the
5004                    // current block length (the frame compressor caps
5005                    // block scan at `HC_BLOCKSIZE_MAX`), not the segment
5006                    // `current_len`. `p < min_match_len` (small constant),
5007                    // so the sum stays well within `u32::MAX`. Use
5008                    // `checked_add` FIRST so the `usize` addition itself
5009                    // cannot overflow on i686 (where `usize` is 32-bit
5010                    // and a wrapping `+` would slip past `try_from`).
5011                    let seed_litlen = initial_litlen
5012                        .checked_add(p)
5013                        .and_then(|s| u32::try_from(s).ok())
5014                        .expect("optimal parser seed litlen out of u32 range");
5015                    nodes[p].litlen = seed_litlen;
5016                }
5017            }
5018
5019            if let Some(candidate) = candidates.last() {
5020                let longest_len = candidate.match_len.min($current_len);
5021                if longest_len > sufficient_len {
5022                    let off_base = BtMatcher::encode_offset_base_with_reps(
5023                        candidate.offset as u32,
5024                        initial_litlen,
5025                        initial_reps,
5026                    );
5027                    let off_price = profile
5028                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5029                    let ml_price = BtMatcher::cached_match_length_price(
5030                        profile,
5031                        $stats,
5032                        longest_len,
5033                        &mut ml_cache,
5034                        ml_price_stamp,
5035                    );
5036                    let seq_cost = BtMatcher::add_prices(
5037                        ll0_price,
5038                        profile.match_price_from_parts(off_price, ml_price, $stats),
5039                    );
5040                    let forced_price = BtMatcher::add_prices(node_prices[0], seq_cost);
5041                    let forced_state = HcOptimalNode {
5042                        off: candidate.offset as u32,
5043                        mlen: longest_len as u32,
5044                        litlen: 0,
5045                        reps: initial_reps,
5046                    };
5047                    if longest_len < frontier_buffer_size && forced_price < node_prices[longest_len] {
5048                        nodes[longest_len] = forced_state;
5049                        node_prices[longest_len] = forced_price;
5050                    }
5051                    forced_end = Some(longest_len);
5052                    forced_end_state = Some(forced_state);
5053                    forced_end_price = Some(forced_price);
5054                    seed_forced_shortest_path = true;
5055                }
5056            }
5057            if !seed_forced_shortest_path {
5058                let mut prev_max_len = min_match_len - 1;
5059                for candidate in candidates.iter() {
5060                    let max_match_len = candidate.match_len.min(frontier_limit);
5061                    if max_match_len < min_match_len {
5062                        continue;
5063                    }
5064                    let start_len = (prev_max_len + 1).max(min_match_len);
5065                    if start_len > max_match_len {
5066                        prev_max_len = prev_max_len.max(max_match_len);
5067                        continue;
5068                    }
5069                    if max_match_len > last_pos {
5070                        BtMatcher::reset_opt_nodes(
5071                            &mut nodes,
5072                            &mut node_prices,
5073                            last_pos + 1,
5074                            max_match_len,
5075                        );
5076                    }
5077                    let off_base = BtMatcher::encode_offset_base_with_reps(
5078                        candidate.offset as u32,
5079                        initial_litlen,
5080                        initial_reps,
5081                    );
5082                    let off_price = profile
5083                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5084                    debug_assert!(max_match_len < frontier_buffer_size);
5085                    let nodes0_price = node_prices[0];
5086                    for match_len in (start_len..=max_match_len).rev() {
5087                        let ml_price = BtMatcher::cached_match_length_price(
5088                            profile,
5089                            $stats,
5090                            match_len,
5091                            &mut ml_cache,
5092                            ml_price_stamp,
5093                        );
5094                        let seq_cost = BtMatcher::add_prices(
5095                            ll0_price,
5096                            profile.match_price_from_parts(off_price, ml_price, $stats),
5097                        );
5098                        let next_cost = BtMatcher::add_prices(nodes0_price, seq_cost);
5099                        let node_price = unsafe { *node_prices.get_unchecked(match_len) };
5100                        if match_len > last_pos || next_cost < node_price {
5101                            let slot = unsafe { nodes.get_unchecked_mut(match_len) };
5102                            *slot = HcOptimalNode {
5103                                off: candidate.offset as u32,
5104                                mlen: match_len as u32,
5105                                litlen: 0,
5106                                reps: initial_reps,
5107                            };
5108                            unsafe { *node_prices.get_unchecked_mut(match_len) = next_cost };
5109                            if match_len > last_pos {
5110                                last_pos = match_len;
5111                            }
5112                        } else if abort_on_worse_match {
5113                            break;
5114                        }
5115                    }
5116                    prev_max_len = prev_max_len.max(max_match_len);
5117                }
5118                if last_pos + 1 < frontier_buffer_size {
5119                    node_prices[last_pos + 1] = u32::MAX;
5120                }
5121            }
5122        }
5123        while !seed_forced_shortest_path && pos <= last_pos && pos <= frontier_limit {
5124            debug_assert!(pos + 1 < frontier_buffer_size);
5125            let prev_node = unsafe { *nodes.get_unchecked(pos - 1) };
5126            let prev_node_price = unsafe { *node_prices.get_unchecked(pos - 1) };
5127            if prev_node_price != u32::MAX {
5128                let lit_len = prev_node.litlen as usize + 1;
5129                let lit_price = {
5130                    let bt = $self.backend.bt_mut();
5131                    BtMatcher::cached_literal_price(
5132                        profile,
5133                        $stats,
5134                        $current[pos - 1],
5135                        &mut bt.opt_lit_price_scratch,
5136                        &mut bt.opt_lit_price_generation,
5137                        lit_price_stamp,
5138                    )
5139                };
5140                let ll_delta = BtMatcher::cached_lit_length_delta_price(
5141                    profile,
5142                    $stats,
5143                    lit_len,
5144                    &mut ll_cache,
5145                    ll_price_stamp,
5146                );
5147                let lit_cost = BtMatcher::add_price_delta(prev_node_price, lit_price, ll_delta);
5148                // `node_pos_price` is the OLD price at `pos` (before the write
5149                // below) — also the price of `prev_match`, the pre-overwrite copy.
5150                let node_pos_price = unsafe { *node_prices.get_unchecked(pos) };
5151                if lit_cost <= node_pos_price {
5152                    let prev_match = unsafe { *nodes.get_unchecked(pos) };
5153                    let slot = unsafe { nodes.get_unchecked_mut(pos) };
5154                    *slot = prev_node;
5155                    slot.litlen = lit_len as u32;
5156                    node_prices[pos] = lit_cost;
5157                    #[allow(clippy::collapsible_if)]
5158                    if opt_level
5159                        && prev_match.mlen > 0
5160                        && prev_match.litlen == 0
5161                        && pos < $current_len
5162                    {
5163                        if ll1_price < ll0_price {
5164                            let next_lit_price = {
5165                                let bt = $self.backend.bt_mut();
5166                                BtMatcher::cached_literal_price(
5167                                    profile,
5168                                    $stats,
5169                                    $current[pos],
5170                                    &mut bt.opt_lit_price_scratch,
5171                                    &mut bt.opt_lit_price_generation,
5172                                    lit_price_stamp,
5173                                )
5174                            };
5175                            let with1literal = BtMatcher::add_price_delta(
5176                                node_pos_price,
5177                                next_lit_price,
5178                                ll1_price as i32 - ll0_price as i32,
5179                            );
5180                            let ll_delta_next = BtMatcher::cached_lit_length_delta_price(
5181                                profile,
5182                                $stats,
5183                                lit_len + 1,
5184                                &mut ll_cache,
5185                                ll_price_stamp,
5186                            );
5187                            let with_more_literals =
5188                                BtMatcher::add_price_delta(lit_cost, next_lit_price, ll_delta_next);
5189                            let next = pos + 1;
5190                            let next_price = unsafe { *node_prices.get_unchecked(next) };
5191                            if with1literal < with_more_literals && with1literal < next_price {
5192                                // Upstream zstd parity (zstd_opt.c:1232): `cur >= prevMatch.mlen`.
5193                                debug_assert!(pos >= prev_match.mlen as usize);
5194                                let prev_pos = pos - prev_match.mlen as usize;
5195                                {
5196                                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5197                                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5198                                        prev_match.off,
5199                                        prev_state.litlen as usize,
5200                                        prev_state.reps,
5201                                    );
5202                                    let slot = unsafe { nodes.get_unchecked_mut(next) };
5203                                    *slot = prev_match;
5204                                    slot.reps = reps_after_match;
5205                                    slot.litlen = 1;
5206                                    node_prices[next] = with1literal;
5207                                    if next > last_pos {
5208                                        last_pos = next;
5209                                    }
5210                                }
5211                            }
5212                        }
5213                    }
5214                }
5215            }
5216
5217            // Memory-resident DP (upstream zstd parity): read opt[cur] fields on
5218            // demand instead of holding a 28-byte node copy live across the
5219            // per-position `$collect` call below. The held copy forced LLVM
5220            // to spill reps[3] + litlen around the (non-inlinable) call;
5221            // reading the fields fresh on each side keeps them out of the
5222            // cross-call live set. `nodes[pos]` is stable across `$collect`
5223            // (it only fills `candidates`), so post-call reads are identical.
5224            let base_cost = unsafe { *node_prices.get_unchecked(pos) };
5225            if base_cost == u32::MAX {
5226                pos += 1;
5227                continue;
5228            }
5229            {
5230                let base_node = unsafe { *nodes.get_unchecked(pos) };
5231                if base_node.mlen > 0 && base_node.litlen == 0 {
5232                    // Upstream zstd parity (zstd_opt.c:1255): `cur >= opt[cur].mlen`.
5233                    debug_assert!(pos >= base_node.mlen as usize);
5234                    let prev_pos = pos - base_node.mlen as usize;
5235                    let prev_state = unsafe { *nodes.get_unchecked(prev_pos) };
5236                    let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5237                        base_node.off,
5238                        prev_state.litlen as usize,
5239                        prev_state.reps,
5240                    );
5241                    unsafe { nodes.get_unchecked_mut(pos).reps = reps_after_match };
5242                }
5243            }
5244
5245            if pos + 8 > $current_len {
5246                pos += 1;
5247                continue;
5248            }
5249
5250            if pos == last_pos {
5251                break;
5252            }
5253
5254            let next_price = unsafe { *node_prices.get_unchecked(pos + 1) };
5255            // `saturating_add` is REQUIRED here, not a masked bug: `base_cost`
5256            // is a node price that can be the `u32::MAX` "unreachable" sentinel,
5257            // and saturating keeps `base_cost + margin` pinned at MAX so the
5258            // comparison stays correct. Plain `+` would wrap the sentinel and
5259            // flip the abort decision (a ratio bug / debug overflow panic).
5260            if abort_on_worse_match
5261                && next_price <= base_cost.saturating_add(HC_BITCOST_MULTIPLIER / 2)
5262            {
5263                pos += 1;
5264                continue;
5265            }
5266
5267            let abs_pos = $current_abs_start + pos;
5268            let ldm_candidate = if has_ldm {
5269                $self.backend.bt_mut().ldm_process_match_candidate(
5270                    &mut opt_ldm,
5271                    pos,
5272                    $current_len - pos,
5273                    min_match_len,
5274                )
5275            } else {
5276                None
5277            };
5278            candidates.clear();
5279            // SAFETY: same umbrella as `$collect`. Query fields are read
5280            // fresh here (consumed into the call's argument) so they do not
5281            // stay live across the call; the post-call reads below are a
5282            // separate, fresh load of the same stable `nodes[pos]`.
5283            unsafe {
5284                $self.$collect::<$strategy_ty, true>(
5285                    abs_pos,
5286                    current_abs_end,
5287                    profile,
5288                    HcCandidateQuery {
5289                        reps: nodes.get_unchecked(pos).reps,
5290                        lit_len: nodes.get_unchecked(pos).litlen as usize,
5291                        ldm_candidate,
5292                    },
5293                    &mut candidates,
5294                )
5295            };
5296            // Post-call reads of opt[cur]: fresh, born after `$collect`, so
5297            // never part of the cross-call live set (see memory-resident note
5298            // above). `nodes[pos]` is untouched by `$collect`.
5299            let base_reps = unsafe { nodes.get_unchecked(pos).reps };
5300            let base_litlen = unsafe { nodes.get_unchecked(pos).litlen as usize };
5301            if let Some(candidate) = candidates.last() {
5302                let longest_len = candidate.match_len.min($current_len - pos);
5303                if longest_len > sufficient_len
5304                    || pos + longest_len >= HC_OPT_NUM
5305                    || pos + longest_len >= $current_len
5306                {
5307                    let lit_len = base_litlen;
5308                    let off_base = BtMatcher::encode_offset_base_with_reps(
5309                        candidate.offset as u32,
5310                        lit_len,
5311                        base_reps,
5312                    );
5313                    let off_price = profile
5314                        .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5315                    let ml_price = BtMatcher::cached_match_length_price(
5316                        profile,
5317                        $stats,
5318                        longest_len,
5319                        &mut ml_cache,
5320                        ml_price_stamp,
5321                    );
5322                    let seq_cost = BtMatcher::add_prices(
5323                        ll0_price,
5324                        profile.match_price_from_parts(off_price, ml_price, $stats),
5325                    );
5326                    let forced_price = BtMatcher::add_prices(base_cost, seq_cost);
5327                    let end_pos = (pos + longest_len).min($current_len);
5328                    forced_end = Some(end_pos);
5329                    forced_end_state = Some(HcOptimalNode {
5330                        off: candidate.offset as u32,
5331                        mlen: longest_len as u32,
5332                        litlen: 0,
5333                        reps: base_reps,
5334                    });
5335                    forced_end_price = Some(forced_price);
5336                    break;
5337                }
5338            }
5339            let mut prev_max_len = min_match_len - 1;
5340            for candidate in candidates.iter() {
5341                // Outer loop guards `pos <= frontier_limit` (see the
5342                // `while ... pos <= frontier_limit` condition); the
5343                // subtraction below is therefore safe.
5344                debug_assert!(pos <= frontier_limit);
5345                let max_match_len = candidate
5346                    .match_len
5347                    .min($current_len - pos)
5348                    .min(frontier_limit - pos);
5349                let min_len = min_match_len;
5350                if max_match_len < min_len {
5351                    continue;
5352                }
5353                let start_len = (prev_max_len + 1).max(min_len);
5354                if start_len > max_match_len {
5355                    prev_max_len = prev_max_len.max(max_match_len);
5356                    continue;
5357                }
5358                let max_next = pos + max_match_len;
5359                if max_next > last_pos {
5360                    BtMatcher::reset_opt_nodes(
5361                        &mut nodes,
5362                        &mut node_prices,
5363                        last_pos + 1,
5364                        max_next,
5365                    );
5366                }
5367                let lit_len = base_litlen;
5368                let off_base = BtMatcher::encode_offset_base_with_reps(
5369                    candidate.offset as u32,
5370                    lit_len,
5371                    base_reps,
5372                );
5373                let off_price = profile
5374                    .offset_price_for::<ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>($stats, off_base);
5375                debug_assert!(pos + max_match_len < frontier_buffer_size);
5376                if abort_on_worse_match {
5377                    // btopt (OPT_LEVEL == 0): reverse-iterate with early break —
5378                    // once a longer match stops improving, shorter ones are
5379                    // skipped. Order-dependent, stays scalar.
5380                    for match_len in (start_len..=max_match_len).rev() {
5381                        let next = pos + match_len;
5382                        let ml_price = BtMatcher::cached_match_length_price(
5383                            profile,
5384                            $stats,
5385                            match_len,
5386                            &mut ml_cache,
5387                            ml_price_stamp,
5388                        );
5389                        let seq_cost = BtMatcher::add_prices(
5390                            ll0_price,
5391                            profile.match_price_from_parts(off_price, ml_price, $stats),
5392                        );
5393                        let next_cost = BtMatcher::add_prices(base_cost, seq_cost);
5394                        let node_next_price = unsafe { *node_prices.get_unchecked(next) };
5395                        if next > last_pos || next_cost < node_next_price {
5396                            let slot = unsafe { nodes.get_unchecked_mut(next) };
5397                            *slot = HcOptimalNode {
5398                                off: candidate.offset as u32,
5399                                mlen: match_len as u32,
5400                                litlen: 0,
5401                                reps: base_reps,
5402                            };
5403                            unsafe { *node_prices.get_unchecked_mut(next) = next_cost };
5404                            if next > last_pos {
5405                                last_pos = next;
5406                            }
5407                        } else {
5408                            break;
5409                        }
5410                    }
5411                } else {
5412                    // btultra / btultra2 (OPT_LEVEL >= 2): no abort, each
5413                    // match_len writes a distinct node => order-independent.
5414                    // Dispatch to the per-tier price-set ($priceset is the
5415                    // tier's fn: AVX2 SoA-vector compare for the avx2 wrapper,
5416                    // inline scalar otherwise) — it folds into this wrapper's
5417                    // monomorphisation, so no call ABI / runtime feature check.
5418                    #[allow(unused_unsafe)]
5419                    {
5420                        last_pos = last_pos.max(unsafe {
5421                            $priceset(
5422                                &mut node_prices,
5423                                &mut nodes,
5424                                ml_cache,
5425                                ml_price_stamp,
5426                                profile,
5427                                $stats,
5428                                pos,
5429                                start_len,
5430                                max_match_len,
5431                                ll0_price,
5432                                off_price,
5433                                base_cost,
5434                                candidate.offset as u32,
5435                                base_reps,
5436                                last_pos,
5437                            )
5438                        });
5439                    }
5440                }
5441                prev_max_len = prev_max_len.max(max_match_len);
5442            }
5443
5444            if last_pos + 1 < frontier_buffer_size {
5445                unsafe {
5446                    *node_prices.get_unchecked_mut(last_pos + 1) = u32::MAX;
5447                }
5448            }
5449            pos += 1;
5450        }
5451
5452        if last_pos == 0 {
5453            if $current_len == 0 {
5454                let price = node_prices[0];
5455                return $self.backend.bt_mut().finish_optimal_plan(
5456                    HcOptimalPlanBuffers {
5457                        nodes,
5458                        node_prices,
5459                        candidates,
5460                        store,
5461                        price_arena,
5462                    },
5463                    (price, initial_reps, initial_litlen, 0),
5464                );
5465            }
5466            let lit_price = {
5467                let bt = $self.backend.bt_mut();
5468                BtMatcher::cached_literal_price(
5469                    profile,
5470                    $stats,
5471                    $current[0],
5472                    &mut bt.opt_lit_price_scratch,
5473                    &mut bt.opt_lit_price_generation,
5474                    lit_price_stamp,
5475                )
5476            };
5477            // `initial_litlen` is carried across optimal-plan segments;
5478            // its real bound is the current block length, not
5479            // `current_len`. On i686 (32-bit `usize`) `+ 1` could
5480            // theoretically wrap if the invariant ever broke. Catch
5481            // that explicitly via `checked_add` rather than letting a
5482            // wrapping sum slip into the price lookup.
5483            let next_litlen = initial_litlen
5484                .checked_add(1)
5485                .expect("optimal parser next litlen out of usize range");
5486            let ll_delta = BtMatcher::cached_lit_length_delta_price(
5487                profile,
5488                $stats,
5489                next_litlen,
5490                &mut ll_cache,
5491                ll_price_stamp,
5492            );
5493            let price = BtMatcher::add_price_delta(node_prices[0], lit_price, ll_delta);
5494            return $self.backend.bt_mut().finish_optimal_plan(
5495                HcOptimalPlanBuffers {
5496                    nodes,
5497                    node_prices,
5498                    candidates,
5499                    store,
5500                    price_arena,
5501                },
5502                (price, initial_reps, next_litlen, 1),
5503            );
5504        }
5505
5506        let target_pos = forced_end.unwrap_or(last_pos.min(frontier_limit));
5507        // Price lives in `node_prices`, not the node struct, so carry the
5508        // final-stretch price alongside its node (forced-seed companion or the
5509        // frontier price at `target_pos`).
5510        let (last_stretch, last_stretch_price) = if let Some(forced_state) = forced_end_state {
5511            (forced_state, forced_end_price.expect("forced state has a price"))
5512        } else {
5513            (nodes[target_pos], node_prices[target_pos])
5514        };
5515        if last_stretch_price == u32::MAX {
5516            return $self.backend.bt_mut().finish_optimal_plan(
5517                HcOptimalPlanBuffers {
5518                    nodes,
5519                    node_prices,
5520                    candidates,
5521                    store,
5522                    price_arena,
5523                },
5524                (u32::MAX, initial_reps, initial_litlen, $current_len),
5525            );
5526        }
5527
5528        if last_stretch.mlen == 0 {
5529            return $self.backend.bt_mut().finish_optimal_plan(
5530                HcOptimalPlanBuffers {
5531                    nodes,
5532                    node_prices,
5533                    candidates,
5534                    store,
5535                    price_arena,
5536                },
5537                (
5538                    last_stretch_price,
5539                    last_stretch.reps,
5540                    last_stretch.litlen as usize,
5541                    target_pos.min($current_len),
5542                ),
5543            );
5544        }
5545
5546        let mut cur = target_pos.saturating_sub(last_stretch.mlen as usize);
5547        let end_reps = if last_stretch.litlen == 0 {
5548            let prev_state = nodes[cur];
5549            let (_, reps_after_match) = BtMatcher::encode_offset_with_reps(
5550                last_stretch.off,
5551                prev_state.litlen as usize,
5552                prev_state.reps,
5553            );
5554            reps_after_match
5555        } else {
5556            let tail_literals = last_stretch.litlen as usize;
5557            if cur < tail_literals {
5558                return $self.backend.bt_mut().finish_optimal_plan(
5559                    HcOptimalPlanBuffers {
5560                        nodes,
5561                        node_prices,
5562                        candidates,
5563                        store,
5564                        price_arena,
5565                    },
5566                    (
5567                        last_stretch_price,
5568                        last_stretch.reps,
5569                        tail_literals,
5570                        target_pos.min($current_len),
5571                    ),
5572                );
5573            }
5574            cur -= tail_literals;
5575            last_stretch.reps
5576        };
5577        let store_end = cur + 2;
5578        if store.len() <= store_end {
5579            store.resize(store_end + 1, HcOptimalNode::default());
5580        }
5581        let mut store_start;
5582        let mut stretch_pos = cur;
5583
5584        if last_stretch.litlen > 0 {
5585            store[store_end] = HcOptimalNode {
5586                litlen: last_stretch.litlen,
5587                mlen: 0,
5588                ..HcOptimalNode::default()
5589            };
5590            store_start = store_end.saturating_sub(1);
5591            store[store_start] = last_stretch;
5592        }
5593        store[store_end] = last_stretch;
5594        store_start = store_end;
5595
5596        loop {
5597            let next_stretch = nodes[stretch_pos];
5598            store[store_start].litlen = next_stretch.litlen;
5599            if next_stretch.mlen == 0 {
5600                break;
5601            }
5602            if store_start == 0 {
5603                break;
5604            }
5605            store_start -= 1;
5606            store[store_start] = next_stretch;
5607            // Parser invariant: every emitted stretch is bounded by the
5608            // current block, so `litlen + mlen <= current_len <=
5609            // HC_BLOCKSIZE_MAX (128 KiB)`. The `as usize` widening + raw
5610            // `+` is safe on 32-bit targets — two u32 values do NOT
5611            // automatically fit in `usize` on i686, the block bound is
5612            // what makes this addition safe.
5613            let litlen = next_stretch.litlen as usize;
5614            let mlen = next_stretch.mlen as usize;
5615            debug_assert!(litlen + mlen <= $current_len);
5616            let step = litlen + mlen;
5617            if step == 0 || stretch_pos < step {
5618                break;
5619            }
5620            stretch_pos -= step;
5621        }
5622
5623        let mut tail_literals = initial_litlen;
5624        let mut store_pos = store_start;
5625        while store_pos <= store_end {
5626            let stretch = store[store_pos];
5627            let llen = stretch.litlen as usize;
5628            let mlen = stretch.mlen as usize;
5629            if mlen == 0 {
5630                tail_literals = llen;
5631                store_pos += 1;
5632                continue;
5633            }
5634            $out.push(HcOptimalSequence {
5635                offset: stretch.off,
5636                match_len: mlen as u32,
5637                lit_len: llen as u32,
5638            });
5639            tail_literals = 0;
5640            store_pos += 1;
5641        }
5642        let result = (
5643            last_stretch_price,
5644            end_reps,
5645            if last_stretch.litlen > 0 {
5646                last_stretch.litlen as usize
5647            } else {
5648                tail_literals
5649            },
5650            target_pos.min($current_len),
5651        );
5652        $self.backend.bt_mut().finish_optimal_plan(
5653            HcOptimalPlanBuffers {
5654                nodes,
5655                node_prices,
5656                candidates,
5657                store,
5658                price_arena,
5659            },
5660            result,
5661        )
5662    }};
5663}
5664
/// `collect_optimal_candidates_initialized` body parameterized over the per-CPU
/// kernel: the `$cpl` path is the kernel's `common_prefix_len_ptr` (used in
/// the HC chain walk fallback), and the four method-name substitutions
/// (`$bt_update`, `$bt_insert`, `$for_each_rep`, `$hash3`) route to the
/// kernel-specific wrappers of the inner helpers. With every helper under
/// the same `target_feature` umbrella, the entire per-position pipeline
/// (BT-tree fill + rep probing + hash3 probing + BT match collection /
/// HC chain walk) inlines without ABI barriers on the level22 hot path.
///
/// # Expansion contract
///
/// * The expansion contains bare `return;` statements, so it must form the
///   body of a function returning `()`.
/// * `$out` is cleared first; every candidate is then offered to
///   `BtMatcher::push_candidate_ladder` together with the running
///   `best_len_for_skip`.
/// * Probe order is fixed: repcodes, then hash3 (only when
///   `Strategy::USE_HASH3` is set, no earlier probe requested a skip and
///   `best_len_for_skip` is still below `HC_OPT_MIN_MATCH_LEN`), then either
///   the binary tree (`$bt_matchfinder == true`) or the hash-chain walk, and
///   finally the optional LDM candidate carried by `$query`.
/// * Positions below `table.skip_insert_until_abs`, or with fewer than four
///   bytes of live history left, only forward the LDM candidate (if any).
macro_rules! collect_optimal_candidates_initialized_body {
    (
        $self:expr,
        $strategy_ty:ty,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $query:ident,
        $out:ident,
        $bt_matchfinder:ident,
        $bt_update:ident,
        $bt_insert:ident,
        $for_each_rep:ident,
        $hash3:ident,
        $cpl:path $(,)?
    ) => {{
        // Per-strategy compile-time const: only BtUltra2 drives the
        // hash3 short-match table. All other monomorphisations drop
        // the entire hash3 lookup block at codegen time. The relaxed
        // implication enforces only the direction we depend on:
        // if the strategy declares hash3, the table must be live.
        // The reverse (`hash3_log != 0` without `USE_HASH3`) is OK —
        // a future caller may pre-allocate hash3 storage without
        // wiring the BtUltra2 path through.
        let use_hash3: bool = <$strategy_ty as super::strategy::Strategy>::USE_HASH3;
        debug_assert!(!$self.table.hash_table.is_empty());
        debug_assert!($self.table.hash3_log == 0 || !$self.table.hash3_table.is_empty());
        debug_assert!(
            !use_hash3 || $self.table.hash3_log != 0,
            "Strategy::USE_HASH3 = true but runtime hash3_log is 0 — call configure() first",
        );
        debug_assert!(!$self.table.chain_table.is_empty());
        let min_match_len = HC_OPT_MIN_MATCH_LEN;
        let reps = $query.reps;
        let lit_len = $query.lit_len;
        let ldm_candidate = $query.ldm_candidate;
        $out.clear();
        // Inside a previously requested skip window: no table work at all,
        // only the externally supplied LDM candidate is reported.
        if $abs_pos < $self.table.skip_insert_until_abs {
            if let Some(ldm) = ldm_candidate {
                let mut best_len_for_skip = 0usize;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut best_len_for_skip,
                    ldm,
                    min_match_len,
                );
            }
            return;
        }
        if $bt_matchfinder {
            // SAFETY: caller is in the same target_feature umbrella as
            // `$bt_update`; the runtime kernel detector already gated entry.
            unsafe { $self.table.$bt_update($abs_pos, $current_abs_end) };
        }
        let current_idx = $abs_pos - $self.table.history_abs_start;
        // Fewer than four readable bytes at the current position: none of
        // the probes below run, so only the LDM candidate is forwarded.
        if current_idx + 4 > $self.table.live_history().len() {
            if let Some(ldm) = ldm_candidate {
                let mut best_len_for_skip = 0usize;
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut best_len_for_skip,
                    ldm,
                    min_match_len,
                );
            }
            return;
        }
        let mut best_len_for_skip = 0usize;
        let mut skip_further_match_search = false;
        let mut rep_len_candidate_found = false;
        // SAFETY: same umbrella; closure capture is monomorphized per call.
        unsafe {
            $self.hc.$for_each_rep(
                &$self.table,
                $abs_pos,
                lit_len,
                reps,
                $current_abs_end,
                min_match_len,
                |rep| {
                    if rep.match_len >= min_match_len {
                        rep_len_candidate_found = true;
                    }
                    let _ = super::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        &mut best_len_for_skip,
                        rep,
                        min_match_len,
                    );
                    if rep.match_len > $profile.sufficient_match_len {
                        skip_further_match_search = true;
                    }
                    // `for_each_repcode_candidate_with_reps` caps
                    // `rep.match_len` at the per-call `tail_limit =
                    // current_abs_end - abs_pos`, so `abs_pos +
                    // rep.match_len <= current_abs_end`. The raw sum
                    // therefore stays in `usize` on every supported
                    // target.
                    if $abs_pos + rep.match_len >= $current_abs_end {
                        skip_further_match_search = true;
                    }
                },
            )
        };
        // Hash3 lookup runs only when the strategy enables it. The
        // `use_hash3` binding above is a per-monomorphisation const,
        // so non-BtUltra2 instances drop this entire block.
        if use_hash3 && !skip_further_match_search && best_len_for_skip < min_match_len {
            $self.table.update_hash3_until($abs_pos);
            // SAFETY: same umbrella for hash3_candidate.
            if let Some(h3) = unsafe {
                $self
                    .table
                    .$hash3($abs_pos, $current_abs_end, min_match_len)
            } {
                let _ = super::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    &mut best_len_for_skip,
                    h3,
                    min_match_len,
                );
                // A long-enough (or block-terminating) hash3 hit ends the
                // search for this position, but only when no repcode
                // candidate was reported before it.
                if !rep_len_candidate_found
                    && (h3.match_len > $profile.sufficient_match_len
                        || $abs_pos + h3.match_len >= $current_abs_end)
                {
                    $self.table.skip_insert_until_abs = $abs_pos + 1;
                    skip_further_match_search = true;
                }
            }
        }
        if !skip_further_match_search && $bt_matchfinder {
            // SAFETY: same umbrella for bt_insert_and_collect_matches.
            unsafe {
                $self.table.$bt_insert(
                    $abs_pos,
                    $current_abs_end,
                    $profile,
                    min_match_len,
                    &mut best_len_for_skip,
                    $out,
                )
            };
        } else if !skip_further_match_search {
            $self.table.insert_position($abs_pos);
            let max_chain_depth = $profile.max_chain_depth.min($self.hc.search_depth);
            let concat = $self.table.live_history();
            // Raw `+ 9` is safe here — see `bt_insert_step_no_rebase_body!`
            // for the full discussion of the upstream `STREAM_ABS_HEADROOM`
            // cap in `MatchTable::add_data`.
            let mut match_end_abs = $abs_pos + 9;
            if max_chain_depth > 0 {
                for (visited, candidate_abs) in $self
                    .hc
                    .chain_candidates(&$self.table, $abs_pos)
                    .into_iter()
                    .enumerate()
                {
                    if visited >= max_chain_depth {
                        break;
                    }
                    // `usize::MAX` is the end-of-chain sentinel.
                    if candidate_abs == usize::MAX {
                        break;
                    }
                    if candidate_abs < $self.table.window_low_abs_for_target($abs_pos)
                        || candidate_abs >= $abs_pos
                    {
                        continue;
                    }
                    let candidate_idx = candidate_abs - $self.table.history_abs_start;
                    debug_assert!(
                        $abs_pos <= $current_abs_end,
                        "HC chain walker called past current block end"
                    );
                    let tail_limit = $current_abs_end - $abs_pos;
                    // Clamp the scan to the bytes that are actually present
                    // after `current_idx`, the same way
                    // `for_each_repcode_candidate_body!` bounds its probe.
                    // This is a no-op while `current_abs_end` lies inside
                    // the live history and keeps the raw-pointer scan below
                    // in bounds even if that invariant is ever violated.
                    let scan_limit = tail_limit.min(concat.len().saturating_sub(current_idx));
                    let base = concat.as_ptr();
                    // SAFETY: `candidate_idx < current_idx` (the window
                    // check above rejects `candidate_abs >= abs_pos`) and
                    // `scan_limit <= concat.len() - current_idx`, so both
                    // pointers stay inside `concat` for the whole scan.
                    // `$cpl` is the kernel-specific common_prefix_len_ptr —
                    // call inlines because the surrounding wrapper carries
                    // the same target_feature.
                    let match_len =
                        unsafe { $cpl(base.add(candidate_idx), base.add(current_idx), scan_limit) };
                    if match_len < min_match_len {
                        continue;
                    }
                    let offset = $abs_pos - candidate_abs;
                    if super::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        &mut best_len_for_skip,
                        MatchCandidate {
                            start: $abs_pos,
                            offset,
                            match_len,
                        },
                        min_match_len,
                    ) {
                        let candidate_end = candidate_abs + match_len;
                        if candidate_end > match_end_abs {
                            match_end_abs = candidate_end;
                        }
                    }
                    if match_len > HC_OPT_NUM || $abs_pos + match_len >= $current_abs_end {
                        break;
                    }
                }
            }
            // `match_end_abs` initialized to `abs_pos + 9`; monotonic
            // updates only ever extend it, so `match_end_abs - 8 >= 1`.
            $self.table.skip_insert_until_abs =
                $self.table.skip_insert_until_abs.max(match_end_abs - 8);
        }
        // The LDM candidate is always offered last, after every table probe.
        if let Some(ldm) = ldm_candidate {
            let _ = super::bt::BtMatcher::push_candidate_ladder(
                $out,
                &mut best_len_for_skip,
                ldm,
                min_match_len,
            );
        }
    }};
}
5893
/// `hash3_candidate` body parameterized over the per-CPU
/// `common_prefix_len_ptr` symbol. The hash3 probe checks one candidate per
/// position when invoked, so the per-call ABI savings compound across the
/// segment. Crate-private (see `bt_insert_step_no_rebase_body!`).
///
/// # Expansion contract
///
/// The expansion uses `return None;` and `?`, so it must form the body of a
/// function returning `Option<MatchCandidate>`.
///
/// * `$table` — match table exposing `hash3_log`, `hash3_table`,
///   `history_abs_start`, `position_base`, `index_shift` and
///   `live_history()`.
/// * `$abs_pos` — absolute position being probed.
/// * `$current_abs_end` — absolute end of the current block; matches never
///   extend past it.
/// * `$min_match_len` — shortest match worth reporting.
/// * `$cpl` — `unsafe fn(*const u8, *const u8, usize) -> usize` returning the
///   common prefix length of the two pointers, scanning at most the given
///   number of bytes.
///
/// Yields `None` when the hash3 table is disabled, fewer than four bytes of
/// history remain at `$abs_pos`, the slot does not resolve to a position
/// inside `[history_abs_start, abs_pos)`, the offset reaches
/// `HC3_MAX_OFFSET`, or the match is shorter than `$min_match_len`.
macro_rules! hash3_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $cpl:path $(,)?
    ) => {{
        if $table.hash3_log == 0 {
            return None;
        }
        let idx = $abs_pos.checked_sub($table.history_abs_start)?;
        let concat = $table.live_history();
        if idx + 4 > concat.len() {
            return None;
        }
        let hash3 = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash3_log,
            3,
        );
        // An out-of-range slot index is treated like an empty slot.
        let entry = $table
            .hash3_table
            .get(hash3)
            .copied()
            .unwrap_or($crate::encoding::match_table::storage::HC_EMPTY);
        let candidate_abs =
            $crate::encoding::match_table::storage::MatchTable::stored_abs_position_fast(
                entry,
                $table.position_base,
                $table.index_shift,
            )?;
        if candidate_abs < $table.history_abs_start || candidate_abs >= $abs_pos {
            return None;
        }
        let offset = $abs_pos - candidate_abs;
        if offset >= $crate::encoding::bt::HC3_MAX_OFFSET {
            return None;
        }
        let candidate_idx = candidate_abs - $table.history_abs_start;
        let tail_limit = $current_abs_end.saturating_sub($abs_pos);
        // Never scan past the bytes that are actually present after `idx`.
        // `idx + 4 <= concat.len()` was established above, so the
        // subtraction cannot underflow. This is a no-op while
        // `current_abs_end` lies inside the live history, and keeps the
        // raw-pointer scan in bounds even if that invariant is violated.
        let scan_limit = tail_limit.min(concat.len() - idx);
        let base = concat.as_ptr();
        // SAFETY: `candidate_idx < idx < concat.len()` (range check above)
        // and `scan_limit <= concat.len() - idx`, so both pointers stay
        // inside `concat` for the whole scan.
        let match_len = unsafe { $cpl(base.add(candidate_idx), base.add(idx), scan_limit) };
        (match_len >= $min_match_len).then_some($crate::encoding::opt::types::MatchCandidate {
            start: $abs_pos,
            offset,
            match_len,
        })
    }};
}
5951pub(crate) use hash3_candidate_body;
5952
/// `for_each_repcode_candidate_with_reps` body parameterized over the per-CPU
/// `common_prefix_len_ptr` symbol so the per-rep prefix probe inlines under
/// the wrapper's `target_feature` umbrella instead of crossing the ABI
/// boundary through the dispatcher. Three rep probes per encoded position →
/// thousands per segment, so the per-call barrier was non-trivial.
///
/// The callback `f` runs in the wrapper's umbrella context too, so closures
/// that capture mutable state still work (FnMut). Crate-private
/// (see `bt_insert_step_no_rebase_body!`).
///
/// # Expansion contract
///
/// The expansion contains a bare `return;`, so it must form the body of a
/// function returning `()`.
///
/// * `$table` — exposes `history_abs_start` and `live_history()`.
/// * `$abs_pos` — absolute position being probed.
/// * `$lit_len` — pending literal count; `0` selects the shifted repcode
///   set (`reps[1]`, `reps[2]`, `reps[0] - 1`).
/// * `$reps` — the three repeat offsets (unsigned integers).
/// * `$current_abs_end` — absolute end of the current block.
/// * `$min_match_len` — shortest match worth reporting.
/// * `$f` — callback invoked once per accepted candidate, in probe order.
/// * `$cpl` — `unsafe fn(*const u8, *const u8, usize) -> usize` prefix
///   kernel.
///
/// Offsets that are zero, reach before `history_abs_start`, or exceed
/// `$abs_pos` are skipped silently.
macro_rules! for_each_repcode_candidate_body {
    (
        $table:expr,
        $abs_pos:ident,
        $lit_len:ident,
        $reps:ident,
        $current_abs_end:ident,
        $min_match_len:ident,
        $f:ident,
        $cpl:path $(,)?
    ) => {{
        let rep_offsets: [Option<usize>; 3] = if $lit_len == 0 {
            [
                Some($reps[1] as usize),
                Some($reps[2] as usize),
                // `reps[0] - 1` is only a usable offset when `reps[0] > 1`.
                // Matching on the value keeps the subtraction out of the
                // `0` case; `bool::then_some` would evaluate it eagerly and
                // trip the debug-build overflow check for `reps[0] == 0`.
                match $reps[0] {
                    0 | 1 => None,
                    first => Some((first - 1) as usize),
                },
            ]
        } else {
            [
                Some($reps[0] as usize),
                Some($reps[1] as usize),
                Some($reps[2] as usize),
            ]
        };
        let concat = $table.live_history();
        let current_idx = $abs_pos - $table.history_abs_start;
        if current_idx + 4 > concat.len() {
            return;
        }
        let tail_limit = $current_abs_end.saturating_sub($abs_pos);
        let base = concat.as_ptr();
        let concat_len = concat.len();
        // `.iter().copied()` yields the offsets by value independent of the
        // edition-specific resolution of `array.into_iter()`.
        for rep in rep_offsets.iter().copied().flatten() {
            if rep == 0 || rep > $abs_pos {
                continue;
            }
            let candidate_pos = $abs_pos - rep;
            if candidate_pos < $table.history_abs_start {
                continue;
            }
            let candidate_idx = candidate_pos - $table.history_abs_start;
            // Upstream zstd `ZSTD_readMINMATCH` gate (zstd_opt.c:657-674): a
            // 4-byte (3-byte when min_match_len == 3) equality probe
            // before the full prefix scan. Equivalent filtering — a
            // mismatch here means `match_len < min_match_len`, which
            // the post-scan check rejects anyway — but it skips the
            // prefix-kernel call for the common no-match case (rep
            // offsets rarely hit on low-redundancy input).
            //
            // SAFETY: `current_idx + 4 <= concat_len` (early return
            // above) and `candidate_idx < current_idx` (rep >= 1), so
            // both 4-byte reads stay inside `concat`.
            let gate_matches = unsafe {
                let cand = base.add(candidate_idx).cast::<u32>().read_unaligned();
                let cur = base.add(current_idx).cast::<u32>().read_unaligned();
                if $min_match_len == 3 {
                    // Compare only the three lowest-address bytes:
                    // `to_le()` puts them in the low 24 bits of the value
                    // on every endianness, and the mask drops the fourth.
                    (cand.to_le() & 0x00FF_FFFF) == (cur.to_le() & 0x00FF_FFFF)
                } else {
                    cand == cur
                }
            };
            if !gate_matches {
                continue;
            }
            // SAFETY: `candidate_idx ≤ current_idx < concat_len` (since
            // candidate_pos ≤ abs_pos and we early-returned on
            // `current_idx + 4 > concat_len`). `max` clamps to the shorter
            // remaining run so neither pointer overruns `concat`.
            let max = (concat_len - candidate_idx)
                .min(concat_len - current_idx)
                .min(tail_limit);
            let match_len = unsafe { $cpl(base.add(candidate_idx), base.add(current_idx), max) };
            if match_len < $min_match_len {
                continue;
            }
            $f(MatchCandidate {
                start: $abs_pos,
                offset: rep,
                match_len,
            });
        }
    }};
}
6047pub(crate) use for_each_repcode_candidate_body;
6048
/// `bt_insert_and_collect_matches` body parameterized over the per-CPU
/// `count_match_from_indices` symbol. Same shape as
/// [`bt_insert_step_no_rebase_body`] — picks up the matching kernel through
/// `$cmf` so the per-iteration vector probe inlines under the wrapper's
/// `target_feature` umbrella. Returns nothing (matches the original method).
/// Crate-private (see `bt_insert_step_no_rebase_body!`).
///
/// What the expansion does, in order:
/// 1. hashes the bytes at `$abs_pos` and (on x86/x86_64 with SSE) prefetches
///    the hash bucket for this position and the next one;
/// 2. swaps the bucket head for the current position and walks the live
///    binary tree stored in `$table.chain_table`, re-linking the smaller /
///    larger child slots as it descends and pushing every candidate longer
///    than the running best through `BtMatcher::push_candidate_ladder`;
/// 3. if a dictionary table is present (`$table.dms.table()`), descends that
///    read-only tree as well with its own compare budget;
/// 4. stores `match_end_abs - 8` into `$table.skip_insert_until_abs`.
///
/// Macro arguments:
/// - `$table`: expression giving the match table whose `hash_table`,
///   `chain_table`, `dms`, `position_base`, `index_shift`, ... are used.
/// - `$search_depth`: upper bound on tree compares; combined with
///   `$profile.max_chain_depth` via `min`.
/// - `$abs_pos`: absolute position being inserted / searched.
/// - `$current_abs_end`: absolute end of the current block; bounds match
///   lengths through `tail_limit = $current_abs_end - $abs_pos`.
/// - `$profile`: value exposing `max_chain_depth`.
/// - `$min_match_len`: shortest match worth reporting (debug-asserted to be
///   at least `HC_FORMAT_MINMATCH`).
/// - `$best_len_for_skip`: dereferenced to seed the running best length and
///   forwarded to `push_candidate_ladder`.
/// - `$out`: candidate sink forwarded to `push_candidate_ladder`.
/// - `$cmf`: path of the match-length kernel, invoked inside `unsafe` as
///   `$cmf(concat, idx, candidate_idx, tail_limit, seed_len)`.
///
/// The expansion contains bare `return;` statements, so it must be expanded
/// inside a function that returns `()`; those returns leave the enclosing
/// function early (too few bytes left to hash, or `relative_position`
/// yielding `None`).
macro_rules! bt_insert_and_collect_matches_body {
    (
        $table:expr,
        $search_depth:expr,
        $abs_pos:ident,
        $current_abs_end:ident,
        $profile:ident,
        $min_match_len:ident,
        $best_len_for_skip:ident,
        $out:ident,
        $cmf:path $(,)?
    ) => {{
        // Index of the current position inside the live history slice.
        let idx = $abs_pos - $table.history_abs_start;
        // Borrowed-aware live region (owned: `history[history_start..]`;
        // borrowed: the in-place input `[0, block_end)`). Reborrow-then-raw-ptr
        // so the slice holds NO borrow and coexists with the `&mut $table`
        // binary-tree writes below. Owned is byte-identical (same bytes).
        // NOTE(review): soundness relies on the backing buffer not being
        // reallocated or freed while `concat` is in use — nothing in this
        // expansion resizes it, but confirm for the helper calls.
        let concat: &[u8] = unsafe {
            let lh = $table.live_history();
            core::slice::from_raw_parts(lh.as_ptr(), lh.len())
        };
        // Fewer than 8 bytes remain at `idx`: bail out of the enclosing
        // function without touching any table.
        if idx + 8 > concat.len() {
            return;
        }
        debug_assert!(
            $abs_pos <= $current_abs_end,
            "BT collect called past current block end"
        );
        // Maximum match length allowed from this position (bytes left in the
        // current block).
        let tail_limit = $current_abs_end - $abs_pos;
        let hash = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
            concat,
            idx,
            $table.hash_log,
            $table.search_mls,
        );
        // Prefetch the hash bucket now. For the large L16+ hash table over
        // high-entropy input the bucket is L3/DRAM-cold, and unlike upstream's
        // monolithic ZSTD_btGetAllMatches (which overlaps this miss with its
        // inline rep/hash3 prologue) the read+write of `hash_table[hash]`
        // below is reached with nothing to hide it behind — it stalled a large
        // share of this function's cycles. Issuing the hint here lets the miss
        // overlap the address setup that follows.
        #[cfg(all(
            target_feature = "sse",
            any(target_arch = "x86", target_arch = "x86_64")
        ))]
        {
            #[cfg(target_arch = "x86")]
            use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
            // SAFETY: prefetch is a hint that never faults; `hash` indexes
            // `hash_table` directly below, so it is in bounds.
            unsafe {
                _mm_prefetch($table.hash_table.as_ptr().add(hash).cast(), _MM_HINT_T0);
            }
            // Prefetch the NEXT position's bucket too. The optimal-parser DP
            // advances one position per iteration, so this miss is issued a
            // full BT walk plus the next iteration's pre-collect work ahead of
            // the collect that will read it — far more lead than the same-call
            // hint above, enough to hide the full DRAM latency.
            if idx + 1 + 8 <= concat.len() {
                let hash_next =
                    $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                        concat,
                        idx + 1,
                        $table.hash_log,
                        $table.search_mls,
                    );
                // SAFETY: the prefetch instruction itself never faults.
                // NOTE(review): `as_ptr().add(hash_next)` still requires the
                // offset to stay inside the `hash_table` allocation; this
                // presumably holds because `hash_next` is produced with the
                // same `$table.hash_log` as `hash` above — confirm that
                // `hash_position_at` always yields an index below
                // `hash_table.len()`.
                unsafe {
                    _mm_prefetch(
                        $table.hash_table.as_ptr().add(hash_next).cast(),
                        _MM_HINT_T0,
                    );
                }
            }
        }
        // No table-relative encoding for this position: leave the enclosing
        // function without inserting.
        let Some(relative_pos) = $table.relative_position($abs_pos) else {
            return;
        };
        // Stored form is `relative + 1`, keeping 0 free for `HC_EMPTY`.
        let stored = relative_pos + 1;
        let bt_mask = $table.bt_mask();
        // Hoist the BT pointer-pair table's base out of `self` once: every
        // access below is `chain_table[computed_index]` through `&mut self`,
        // which the optimizer cannot prove loop-invariant, so it reloads the
        // Vec's (ptr,len) from the struct AND bounds-checks on every tree
        // step (the upstream zstd walks a raw `U32* btable`, zstd_opt.c). The raw
        // base carries no borrow, so the `&self` helper calls in the loop
        // (`bt_pair_index_for_abs`, `window_low_abs_for_target`,
        // `relative_position`) coexist — they read other fields, never
        // `chain_table`. Indices are in bounds by the BT invariants:
        // `bt_pair_index_for_abs` returns `2*(abs & bt_mask) (+1)` ≤
        // `chain_table.len()-1`, and the slots only ever hold those values.
        let chain_ptr = $table.chain_table.as_mut_ptr();
        debug_assert_eq!($table.chain_table.len(), 2 << $table.bt_log());
        // See `bt_insert_step_no_rebase_body!`: saturating is needed for the
        // first BT walk of a fresh frame where `abs_pos < bt_mask`.
        let bt_low = $abs_pos.saturating_sub(bt_mask);
        let window_low = $table.window_low_abs_for_target($abs_pos);
        // Upstream zstd-style window bound in stored space so the BT-walk loop
        // condition rejects out-of-window / HC_EMPTY candidates WITHOUT
        // decoding them (mirrors upstream `while ... matchIndex >= matchLow`):
        // one range check on `match_stored` instead of decode-then-break,
        // dropping the wasted candidate_abs decode on every walk's terminating
        // step. candidate_abs(s) = (position_base + s - 1) - index_shift =
        // base + s (wrapping); in-window ⟺ candidate_abs - window_low <
        // abs_pos - window_low ⟺ s.wrapping_add(win_off) < win_range.
        // HC_EMPTY (s = 0) maps to base = (lowest representable abs) - 1 <
        // window_low, so it falls out of range and ends the walk.
        let win_off = $table
            .position_base
            .wrapping_sub(1)
            .wrapping_sub($table.index_shift)
            .wrapping_sub(window_low);
        let win_range = $abs_pos - window_low;
        // Raw `+ 9` is safe here — see `bt_insert_step_no_rebase_body!`
        // for the full discussion of the upstream `STREAM_ABS_HEADROOM`
        // cap in `MatchTable::add_data`.
        let mut match_end_abs = $abs_pos + 9;
        // Compare budget for the live-tree walk.
        let mut compares_left = $profile.max_chain_depth.min($search_depth);
        // Prefix lengths already known to match on the smaller / larger side
        // of the descent; their minimum seeds the next `$cmf` call.
        let mut common_length_smaller = 0usize;
        let mut common_length_larger = 0usize;
        // Child-pointer pair of the node being inserted: `pair_idx` is the
        // smaller child slot, `pair_idx + 1` the larger one.
        let pair_idx = $table.bt_pair_index_for_abs($abs_pos);
        let mut smaller_slot = pair_idx;
        let mut larger_slot = pair_idx + 1;
        // Read the previous bucket head (first candidate), then make the
        // current position the new head.
        let mut match_stored = $table.hash_table[hash];
        $table.hash_table[hash] = stored;
        // Upstream zstd semantics: `bestLength` starts at `lengthToBeat - 1`; rep/hash3
        // probing may raise it; BT then only reports strictly longer matches.
        // `min_match_len >= HC_FORMAT_MINMATCH (3)` by configure invariant,
        // so `min_match_len - 1 >= 2` cannot underflow.
        debug_assert!(
            $min_match_len >= $crate::encoding::cost_model::HC_FORMAT_MINMATCH,
            "min_match_len must be at least HC_FORMAT_MINMATCH"
        );
        let mut best_len = (*$best_len_for_skip).max($min_match_len - 1);

        // Upstream zstd-form loop condition: the stored-space window range check
        // (`s.wrapping_add(win_off) < win_range`) rejects out-of-window and
        // HC_EMPTY candidates here, so the terminating step never enters the
        // body — no wasted candidate_abs decode, matching upstream's
        // `while ... matchIndex >= matchLow`.
        while compares_left > 0 && (match_stored as usize).wrapping_add(win_off) < win_range {
            compares_left -= 1;
            // The condition proved this candidate is in `[window_low,
            // abs_pos)`, so `match_stored >= 1` (HC_EMPTY is out of range) and
            // the `- 1` cannot underflow; candidate_abs == base + match_stored.
            let candidate_abs = ($table.position_base + (match_stored as usize - 1))
                .wrapping_sub($table.index_shift);

            let next_pair_idx = $table.bt_pair_index_for_abs(candidate_abs);
            // SAFETY: `next_pair_idx (+1)` = `2*(candidate_abs & bt_mask) (+1)`
            // ≤ `chain_table.len()-1`; `chain_ptr` is the hoisted live base,
            // table not realloc'd during the walk.
            let next_smaller = unsafe { *chain_ptr.add(next_pair_idx) };
            let next_larger = unsafe { *chain_ptr.add(next_pair_idx + 1) };
            // Both sides already agree on at least this many bytes, so the
            // kernel can resume comparing from there.
            let seed_len = common_length_smaller.min(common_length_larger);
            let candidate_idx = candidate_abs - $table.history_abs_start;
            // SAFETY: BT walk invariant — `candidate_idx + tail_limit ≤
            // concat.len()`.
            let match_len = unsafe { $cmf(concat, idx, candidate_idx, tail_limit, seed_len) };

            // Only strictly longer matches are offered to the ladder.
            if match_len > best_len {
                let offset = $abs_pos - candidate_abs;
                let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
                    $out,
                    $best_len_for_skip,
                    $crate::encoding::opt::types::MatchCandidate {
                        start: $abs_pos,
                        offset,
                        match_len,
                    },
                    $min_match_len,
                );
                if accepted {
                    best_len = match_len;
                    // BT walker invariants: `candidate_abs < abs_pos`
                    // and `match_len <= tail_limit = current_abs_end -
                    // abs_pos`. So `candidate_abs + match_len <
                    // abs_pos + tail_limit = current_abs_end`, which
                    // fits in `usize` on every supported target (32-bit
                    // i686 included) — the addition stays within the
                    // current block.
                    let candidate_end = candidate_abs + match_len;
                    if candidate_end > match_end_abs {
                        match_end_abs = candidate_end;
                    }
                    // Stop without re-linking this candidate when the match
                    // reaches the block end or exceeds `HC_OPT_NUM`; the
                    // pending slots are closed with `HC_EMPTY` after the loop.
                    if match_len >= tail_limit
                        || match_len > $crate::encoding::cost_model::HC_OPT_NUM
                    {
                        break;
                    }
                }
            }

            // A match that runs to the block end has no "first differing
            // byte" to order on, so the descent cannot continue.
            if match_len >= tail_limit {
                break;
            }

            let candidate_next = candidate_idx + match_len;
            let current_next = idx + match_len;
            // SAFETY: first-differing positions after a match_len-long prefix;
            // match_len < tail_limit (break above) + BT-walk bound
            // idx/candidate_idx + tail_limit <= concat.len() keep both in range.
            if unsafe {
                *concat.get_unchecked(candidate_next) < *concat.get_unchecked(current_next)
            } {
                // Candidate sorts below the current suffix: link it as the
                // pending "smaller" child and continue with its larger child.
                // SAFETY: `smaller_slot` holds a valid pair index (init
                // `pair_idx`, updated to `next_pair_idx + 1`); the `usize::MAX`
                // sentinel is set only just before `break`, never written here.
                unsafe { *chain_ptr.add(smaller_slot) = match_stored };
                common_length_smaller = match_len;
                // Candidate is at or below `bt_low`: stop here and mark the
                // slot so the post-loop cleanup leaves it untouched.
                if candidate_abs <= bt_low {
                    smaller_slot = usize::MAX;
                    break;
                }
                smaller_slot = next_pair_idx + 1;
                match_stored = next_larger;
            } else {
                // Candidate sorts at or above the current suffix: link it as
                // the pending "larger" child and continue with its smaller
                // child.
                // SAFETY: as above for `larger_slot`.
                unsafe { *chain_ptr.add(larger_slot) = match_stored };
                common_length_larger = match_len;
                if candidate_abs <= bt_low {
                    larger_slot = usize::MAX;
                    break;
                }
                larger_slot = next_pair_idx;
                match_stored = next_smaller;
            }
        }

        // Terminate whichever child slots are still pending with `HC_EMPTY`.
        // SAFETY: both slots, when not the `usize::MAX` sentinel, hold valid
        // pair indices into the hoisted `chain_table` base.
        if smaller_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(smaller_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }
        if larger_slot != usize::MAX {
            unsafe {
                *chain_ptr.add(larger_slot) = $crate::encoding::match_table::storage::HC_EMPTY
            };
        }

        // Dict dual-probe (upstream zstd `ZSTD_dictMatchState`, zstd_opt.c:777-813):
        // after the live tree, descend the immutable dictionary BINARY TREE
        // (built in `prime_dms_bt`) with its OWN compare budget and push any
        // dict match longer than the live best into the ladder. The DUBT
        // descent reaches the longest dict match efficiently (a hash-chain
        // surfaced only the few same-bucket candidates and left most of the
        // dict savings unrealised at btlazy2 / btopt). Dict positions are
        // dictionary-relative concat indices in `[0, region)`, pinned at the
        // front of history, so a dict candidate at `dict_idx` sits at offset
        // `idx - dict_idx` (no upstream zstd `dmsIndexDelta`). The optimal parser
        // prices these (its DP lookahead values the repcode chain a dict match
        // seeds); the greedy/lazy parser commits the longest.
        if let Some(dms) = $table.dms.table() {
            let region = $table.dms.region_len();
            // The dictionary table is hashed with its own `hash_log` / `mls`.
            let dh = $crate::encoding::match_table::storage::MatchTable::hash_position_at(
                concat,
                idx,
                dms.hash_log,
                dms.mls,
            );
            let mut dcur = dms.hash_table[dh];
            // DUBT seed lengths: bytes already known common on each side, so
            // `$cmf` resumes from there (upstream zstd commonLengthSmaller/Larger).
            let mut common_smaller = 0usize;
            let mut common_larger = 0usize;
            // Independent compare budget for the dictionary descent.
            let mut dms_compares = $profile.max_chain_depth.min($search_depth);
            while dms_compares > 0 && dcur != $crate::encoding::match_table::storage::HC_EMPTY {
                // `dcur != HC_EMPTY` was just checked; stored values are
                // `index + 1`, hence the `- 1`.
                let dict_idx = (dcur - 1) as usize;
                // The dict tree holds only dict positions (`< region <= idx`).
                if dict_idx >= region || dict_idx >= idx {
                    break;
                }
                dms_compares -= 1;
                // Child pair of this dictionary node: `pair` is read when
                // descending to the smaller side, `pair + 1` for the larger.
                let pair = 2 * dict_idx;
                let seed = common_smaller.min(common_larger);
                // SAFETY: `dict_idx < idx` and `idx + tail_limit <=
                // concat.len()`; same umbrella as the live walk's `$cmf`.
                // `seed <= prior match_len <= tail_limit`.
                // NOTE(review): the entry guard visible in this macro only
                // checks `idx + 8 <= concat.len()`; the `idx + tail_limit`
                // bound is presumably a caller invariant — confirm.
                let match_len = unsafe { $cmf(concat, idx, dict_idx, tail_limit, seed) };
                if match_len > best_len {
                    let offset = idx - dict_idx;
                    let accepted = $crate::encoding::bt::BtMatcher::push_candidate_ladder(
                        $out,
                        $best_len_for_skip,
                        $crate::encoding::opt::types::MatchCandidate {
                            start: $abs_pos,
                            offset,
                            match_len,
                        },
                        $min_match_len,
                    );
                    if accepted {
                        best_len = match_len;
                        // Unlike the live walk, the end is measured from the
                        // current position (`$abs_pos`), not the candidate.
                        let candidate_end = $abs_pos + match_len;
                        if candidate_end > match_end_abs {
                            match_end_abs = candidate_end;
                        }
                        if match_len > $crate::encoding::cost_model::HC_OPT_NUM {
                            break;
                        }
                    }
                }
                // Match reached the block tail: can't order the pair (upstream zstd
                // `ip+matchLength == iLimit`), and indexing `concat[idx +
                // match_len]` below would step past the searchable region.
                if match_len >= tail_limit {
                    break;
                }
                // Descend the DUBT (upstream zstd zstd_opt.c:806-811): dict candidate
                // smaller than input → its larger child is closer to `idx`.
                if concat[dict_idx + match_len] < concat[idx + match_len] {
                    common_smaller = match_len;
                    dcur = dms.chain_table[pair + 1];
                } else {
                    common_larger = match_len;
                    dcur = dms.chain_table[pair];
                }
            }
        }

        // `match_end_abs >= abs_pos + 9 >= 9` (initialized and monotonic),
        // so `match_end_abs - 8 >= 1` cannot underflow.
        $table.skip_insert_until_abs = match_end_abs - 8;
    }};
}
6386pub(crate) use bt_insert_and_collect_matches_body;
6387
6388impl HcMatchGenerator {
6389    /// Heap bytes this generator owns: the shared match table plus the BT
6390    /// backend's optimal-parser / LDM scratch (the HC knobs are inline).
6391    fn heap_size(&self) -> usize {
6392        self.table.heap_size() + self.backend.heap_size()
6393    }
6394
6395    fn should_run_btultra2_seed_pass<S: super::strategy::Strategy>(
6396        &self,
6397        current_len: usize,
6398    ) -> bool {
6399        // The in-block two-pass dynamic-stats seed (`initStats_ultra`)
6400        // is btultra2-only. `TWO_PASS_SEED` is `false` for every other
6401        // strategy — including btultra, which now shares the hash3
6402        // short-match probe but stays single-pass — so the seed call and
6403        // its body drop at codegen time for all non-btultra2 kernels.
6404        if !S::TWO_PASS_SEED {
6405            return false;
6406        }
6407        let HcBackend::Bt(bt) = &self.backend else {
6408            return false;
6409        };
6410        bt.opt_state.lit_length_sum == 0
6411            && bt.opt_state.dictionary_seed.is_none()
6412            && !self.table.dictionary_primed_for_frame
6413            && bt.ldm_sequences.is_empty()
6414            && self.table.window_size == current_len
6415            && self.table.history_abs_start == 0
6416            && self.table.chunk_lens.len() == 1
6417            && current_len > HC_PREDEF_THRESHOLD
6418    }
6419
6420    fn new(max_window_size: usize) -> Self {
6421        Self {
6422            table: super::match_table::storage::MatchTable::new(max_window_size),
6423            hc: super::hc::HcMatcher::new(2, HC_SEARCH_DEPTH, HC_TARGET_LEN),
6424            // Default to the zero-sized HC backend; `configure()` swaps
6425            // in a `BtMatcher` only when an optimal strategy lands.
6426            backend: HcBackend::Hc,
6427            // Lazy is the per-construct default — every production
6428            // caller calls `configure()` before the first encode and
6429            // overwrites this. Tests that drive `HcMatchGenerator`
6430            // without calling `configure()` end up in the
6431            // `start_matching_lazy` arm of the test dispatcher, which
6432            // matches the previous default behaviour.
6433            strategy_tag: super::strategy::StrategyTag::Lazy,
6434        }
6435    }
6436
6437    fn configure(&mut self, config: HcConfig, tag: super::strategy::StrategyTag, window_log: u8) {
6438        use super::strategy::StrategyTag;
6439        // Mirror the driver-resolved strategy tag so the
6440        // `#[cfg(test)] start_matching` dispatcher can route
6441        // BtOpt / BtUltra / BtUltra2 to distinct monomorphisations.
6442        self.strategy_tag = tag;
6443        let is_btultra2 = tag == StrategyTag::BtUltra2;
6444        let uses_bt = matches!(
6445            tag,
6446            StrategyTag::Btlazy2
6447                | StrategyTag::BtOpt
6448                | StrategyTag::BtUltra
6449                | StrategyTag::BtUltra2
6450        );
6451        // btultra and btultra2 both run the mls=3 hash3 short-match probe
6452        // (clevels.h minMatch 3). The `is_btultra2` flag below stays
6453        // exclusive to btultra2 because it tweaks the BT rebase boundary,
6454        // not match finding.
6455        let wants_hash3 = matches!(tag, StrategyTag::BtUltra | StrategyTag::BtUltra2);
6456        let next_hash3_log = if wants_hash3 {
6457            HC3_HASH_LOG.min(window_log as usize)
6458        } else {
6459            0
6460        };
6461        let resize = self.table.hash_log != config.hash_log
6462            || self.table.chain_log != config.chain_log
6463            || self.table.hash3_log != next_hash3_log;
6464        // Capture the layout flip BEFORE `uses_bt` is overwritten below — it
6465        // feeds the dms invalidation (the dms is keyed by layout too).
6466        let uses_bt_changed = self.table.uses_bt != uses_bt;
6467        self.table.hash_log = config.hash_log;
6468        self.table.chain_log = config.chain_log;
6469        self.table.hash3_log = next_hash3_log;
6470        self.hc.search_depth = if uses_bt {
6471            config.search_depth
6472        } else {
6473            config.search_depth.min(MAX_HC_SEARCH_DEPTH)
6474        };
6475        self.hc.target_len = config.target_len;
6476        // Mirror strategy-derived flags + HC search depth onto MatchTable
6477        // so the BT walker and rebase machinery can read them directly
6478        // without dispatching back through HcMatchGenerator.
6479        self.table.search_depth = self.hc.search_depth;
6480        self.table.is_btultra2 = is_btultra2;
6481        self.table.uses_bt = uses_bt;
6482        // BT finder hash width, upstream zstd `mls = BOUNDED(4, cParams.minMatch, 6)`,
6483        // carried explicitly in the level config so a `target_length` override
6484        // cannot silently flip the finder between 5- and 4-byte hashing. Only
6485        // the BT body reads it; HC/lazy levels leave it at 4. clevels.h
6486        // (srcSize > 256 KiB tier): btlazy2 L13-15 + btopt L16 are minMatch=5,
6487        // btopt L17 is minMatch=4, btultra/btultra2 are minMatch=3 (4-byte main
6488        // hash + the hash3 short-match probe).
6489        // The cached dms is keyed by the full (region, layout, mls, hash_log)
6490        // shape that `build_dms!` validates on the normal prime path, but the
6491        // reborrow fast path in `MatchTable::reset` reuses it on `dms.is_primed()`
6492        // ALONE. A reused-compressor level switch can change the search mls (e.g.
6493        // btlazy2 -> lazy), the table geometry (hash_log / chain_log / hash3,
6494        // captured in `resize`), OR the HC<->BT layout (`uses_bt_changed`)
6495        // independently of each other, and any of them leaves the dms hashed for
6496        // a different shape. Invalidate on ANY so the next dict frame re-primes at
6497        // the new shape (configure runs before reset) instead of probing a
6498        // mismatched dms and silently degrading match quality. Over-invalidation
6499        // only costs a re-prime, which a real shape change needs anyway.
6500        let mls_changed = self.table.search_mls != config.search_mls;
6501        if resize || mls_changed || uses_bt_changed {
6502            self.table.dms.invalidate();
6503        }
6504        self.table.search_mls = config.search_mls;
6505        // Stage D: promote the backend discriminator. HC modes drop the
6506        // BT scratch buffers entirely; switching back into a BT mode
6507        // allocates a fresh `BtMatcher` on demand.
6508        match (&self.backend, self.table.uses_bt) {
6509            (HcBackend::Hc, true) => {
6510                self.backend = HcBackend::Bt(alloc::boxed::Box::new(super::bt::BtMatcher::new()));
6511            }
6512            (HcBackend::Bt(_), false) => {
6513                self.backend = HcBackend::Hc;
6514            }
6515            _ => {}
6516        }
6517        if resize && !self.table.hash_table.is_empty() {
6518            // Force reallocation on next ensure_tables() call.
6519            self.table.hash_table.clear();
6520            self.table.hash3_table.clear();
6521            self.table.chain_table.clear();
6522        }
6523    }
6524
6525    fn seed_dictionary_entropy(
6526        &mut self,
6527        huff: Option<&crate::huff0::huff0_encoder::HuffmanTable>,
6528        ll: Option<&crate::fse::fse_encoder::FSETable>,
6529        ml: Option<&crate::fse::fse_encoder::FSETable>,
6530        of: Option<&crate::fse::fse_encoder::FSETable>,
6531    ) {
6532        if let HcBackend::Bt(bt) = &mut self.backend {
6533            bt.opt_state.seed_dictionary_entropy(huff, ll, ml, of);
6534        }
6535    }
6536
6537    /// Install (or clear) the long-distance-match producer (#27). Only
6538    /// the BT backend owns an `ldm_producer` slot; on the HC (lazy)
6539    /// backend the producer is dropped because there is no optimal-parser
6540    /// candidate buffer to seed. Call after [`Self::reset`].
6541    #[cfg(feature = "hash")]
6542    fn set_ldm_producer(&mut self, producer: Option<super::ldm::LdmProducer>) {
6543        if let HcBackend::Bt(bt) = &mut self.backend {
6544            bt.ldm_producer = producer;
6545        }
6546    }
6547
6548    /// Move the LDM producer out of the BT backend, leaving `None`. Used by the
6549    /// dictionary snapshot path: the producer carries no dictionary state (LDM
6550    /// is not dict-primed; its hash table is empty at capture), so it is not
6551    /// retained in the snapshot — the working frame's freshly-reset producer is
6552    /// reinstated on restore instead.
6553    #[cfg(feature = "hash")]
6554    fn take_ldm_producer(&mut self) -> Option<super::ldm::LdmProducer> {
6555        if let HcBackend::Bt(bt) = &mut self.backend {
6556            bt.ldm_producer.take()
6557        } else {
6558            None
6559        }
6560    }
6561
6562    fn reset(&mut self, reuse_space: impl FnMut(Vec<u8>)) {
6563        self.table.reset(reuse_space);
6564        if let HcBackend::Bt(bt) = &mut self.backend {
6565            bt.reset();
6566        }
6567    }
6568
6569    /// Backfill positions from the tail of the previous slice that couldn't be
6570    /// hashed at the time (insert_position needs 4 bytes of lookahead).
6571    fn skip_matching(&mut self, incompressible_hint: Option<bool>) {
6572        self.table.skip_matching(incompressible_hint);
6573    }
6574
6575    /// Runtime-dispatched entry kept only for in-crate tests. Production
6576    /// callers reach the inner loops through
6577    /// [`Self::start_matching_strategy`] / [`MatchGeneratorDriver::compress_block`]
6578    /// which pick the lazy / optimal arm from `S::USE_BT` at
6579    /// monomorphisation time.
6580    #[cfg(test)]
6581    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6582        use super::strategy::{self, StrategyTag};
6583        // Dispatch on the mirrored `strategy_tag` so each test runs
6584        // under the same monomorphisation production would pick.
6585        // `BtOpt` / `BtUltra` / `BtUltra2` remain distinct here even
6586        // though `table.uses_bt` / `is_btultra2` alone can't separate
6587        // BtOpt from BtUltra.
6588        match self.strategy_tag {
6589            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
6590                self.start_matching_lazy(&mut handle_sequence)
6591            }
6592            StrategyTag::Btlazy2 => self.start_matching_btlazy2(&mut handle_sequence),
6593            StrategyTag::BtOpt => {
6594                self.start_matching_optimal::<strategy::BtOpt>(&mut handle_sequence)
6595            }
6596            StrategyTag::BtUltra => {
6597                self.start_matching_optimal::<strategy::BtUltra>(&mut handle_sequence)
6598            }
6599            StrategyTag::BtUltra2 => {
6600                self.start_matching_optimal::<strategy::BtUltra2>(&mut handle_sequence)
6601            }
6602        }
6603    }
6604
6605    /// Strategy-aware entry point used by
6606    /// [`MatchGeneratorDriver::compress_block`]. Branches on
6607    /// `S::USE_BT` — a compile-time `const` — so each
6608    /// monomorphisation keeps exactly one arm: `Lazy` /
6609    /// `Fast` / `Dfast` / `Greedy` see only `start_matching_lazy`,
6610    /// `BtOpt` / `BtUltra` / `BtUltra2` see only
6611    /// `start_matching_optimal`. The inherent test-only
6612    /// [`HcMatchGenerator::start_matching`] reaches the same arms by
6613    /// runtime-matching on `self.strategy_tag` (the parse-mode field
6614    /// has been removed); production never invokes that path.
6615    pub(crate) fn start_matching_strategy<S: super::strategy::Strategy>(
6616        &mut self,
6617        handle_sequence: &mut impl for<'a> FnMut(Sequence<'a>),
6618    ) {
6619        debug_assert_eq!(
6620            self.table.uses_bt,
6621            S::USE_BT,
6622            "Strategy::USE_BT disagrees with runtime table.uses_bt at HC dispatch"
6623        );
6624        if S::USE_BT {
6625            self.start_matching_optimal::<S>(handle_sequence)
6626        } else {
6627            self.start_matching_lazy(handle_sequence)
6628        }
6629    }
6630
6631    /// Dispatcher: pick the dict-aware monomorph when a separate dms is primed
6632    /// (attach-mode dictionary), else the no-dict monomorph. Mirrors upstream's
6633    /// compile-time `dictMode` split — the `DICT = false` body carries no dms
6634    /// code at all, so the no-dict hot path is unaffected by the dict search.
6635    pub(crate) fn start_matching_lazy(
6636        &mut self,
6637        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6638    ) {
6639        if self.table.dms.is_primed() {
6640            self.start_matching_lazy_impl::<true>(handle_sequence);
6641        } else {
6642            self.start_matching_lazy_impl::<false>(handle_sequence);
6643        }
6644    }
6645
6646    fn start_matching_lazy_impl<const DICT: bool>(
6647        &mut self,
6648        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6649    ) {
6650        self.table.ensure_tables();
6651
6652        // `current_block_range()` is borrowed-aware: owned → last committed
6653        // chunk; borrowed → the staged in-place block range.
6654        let (current_abs_start, current_len) = self.table.current_block_range();
6655        if current_len == 0 {
6656            return;
6657        }
6658        // The current block is the tail of `history` (owned) or the staged
6659        // borrowed range (`get_last_space()` resolves both). Hoist it as a raw
6660        // slice: the routine mutates the hash/chain tables + `offset_hist` but
6661        // never reallocates `history`, so the slice stays valid and we avoid
6662        // re-borrowing `self.table` (which would conflict with the
6663        // `offset_hist` write).
6664        let current_ptr = self.table.get_last_space().as_ptr();
6665        let current: &[u8] = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
6666
6667        let current_abs_end = current_abs_start + current_len;
6668        self.table
6669            .backfill_boundary_positions(current_abs_start, current_abs_end);
6670
6671        let mut pos = 0usize;
6672        let mut literals_start = 0usize;
6673        while pos + HC_MIN_MATCH_LEN <= current_len {
6674            let abs_pos = current_abs_start + pos;
6675            let lit_len = pos - literals_start;
6676
6677            let best = self
6678                .hc
6679                .find_best_match::<DICT>(&self.table, abs_pos, lit_len);
6680            if let Some(candidate) =
6681                self.hc
6682                    .pick_lazy_match::<DICT>(&self.table, abs_pos, lit_len, best)
6683            {
6684                self.table
6685                    .insert_match_span(abs_pos, candidate.start + candidate.match_len);
6686                let start = candidate.start - current_abs_start;
6687                let literals = &current[literals_start..start];
6688                handle_sequence(Sequence::Triple {
6689                    literals,
6690                    offset: candidate.offset,
6691                    match_len: candidate.match_len,
6692                });
6693                let _ = encode_offset_with_history(
6694                    candidate.offset as u32,
6695                    literals.len() as u32,
6696                    &mut self.table.offset_hist,
6697                );
6698                pos = start + candidate.match_len;
6699                literals_start = pos;
6700            } else {
6701                self.table.insert_position(abs_pos);
6702                // Lazy skipping (upstream zstd `ZSTD_compressBlock_lazy_generic`,
6703                // zstd_lazy.c:1614): advance faster over runs with no match.
6704                // `step = ((ip - anchor) >> kSearchStrength) + 1` with
6705                // kSearchStrength = 8, where `ip - anchor` is the current
6706                // literal-run length. On compressible input the run stays short
6707                // (step == 1, identical to a 1-byte advance); on incompressible
6708                // / dict-over-random input the run grows so the parser skips
6709                // ahead (one search per `step` positions) instead of searching
6710                // every byte. Skipped positions are not inserted, mirroring
6711                // upstream (it inserts only searched positions during a no-match
6712                // run). Ratio follows upstream (not byte-identical).
6713                let step = ((pos - literals_start) >> 8) + 1;
6714                pos += step;
6715                // No clamp needed before the tail loop: the search bound and the
6716                // hashable bound are both `pos + HC_MIN_MATCH_LEN <= current_len`
6717                // (HC_MIN_MATCH_LEN == 4 == the insert width), so there is no
6718                // non-searchable-but-hashable anchor to miss. Positions the skip
6719                // jumps over inside the searchable region are intentionally not
6720                // inserted — same as upstream zstd, which advances past them via
6721                // the identical `ip += step` and never hashes them either.
6722            }
6723        }
6724
6725        // Insert remaining hashable positions in the tail (the matching loop
6726        // stops at HC_MIN_MATCH_LEN but insert_position only needs 4 bytes).
6727        while pos + 4 <= current_len {
6728            self.table.insert_position(current_abs_start + pos);
6729            pos += 1;
6730        }
6731
6732        if literals_start < current_len {
6733            handle_sequence(Sequence::Literals {
6734                literals: &current[literals_start..],
6735            });
6736        }
6737    }
6738
6739    /// Register the borrowed input window for the no-copy one-shot path.
6740    /// # Safety
6741    /// `buffer` must outlive the borrowed scans (see `MatchTable`).
6742    pub(crate) unsafe fn set_borrowed_window(&mut self, buffer: &[u8]) {
6743        // SAFETY: forwarded liveness contract.
6744        unsafe { self.table.set_borrowed_window(buffer) };
6745    }
6746
6747    pub(crate) fn clear_borrowed_window(&mut self) {
6748        self.table.clear_borrowed_window();
6749    }
6750
6751    /// Borrowed (no-copy) equivalent of [`Self::start_matching_lazy`]: stage
6752    /// the in-place block range, then run the same lazy chain parse. The
6753    /// parse reads its range via `current_block_range()` and its bytes via
6754    /// `get_last_space()` / `live_history()`, all borrowed-aware, so the block
6755    /// is scanned in place with the per-position window_low offset cap.
6756    pub(crate) fn start_matching_lazy_borrowed(
6757        &mut self,
6758        block_start: usize,
6759        block_end: usize,
6760        handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6761    ) {
6762        self.table.stage_borrowed_block(block_start, block_end);
6763        self.start_matching_lazy(handle_sequence);
6764    }
6765
6766    /// Borrowed (no-copy) equivalent of the lazy `skip_matching`: stage the
6767    /// in-place block, then seed positions without an owned-history append.
6768    pub(crate) fn skip_matching_borrowed(
6769        &mut self,
6770        block_start: usize,
6771        block_end: usize,
6772        incompressible_hint: Option<bool>,
6773    ) {
6774        self.table.stage_borrowed_block(block_start, block_end);
6775        self.table.skip_matching(incompressible_hint);
6776    }
6777
6778    /// Upstream zstd `ZSTD_btlazy2` (levels 13-15): binary-tree match finder with a
6779    /// greedy/lazy parse. Bare dispatcher — resolves the runtime tier ONCE
6780    /// per block via `select_kernel()` and calls the matching
6781    /// `start_matching_btlazy2_<kernel>` wrapper, so the per-position BT
6782    /// collect runs under a single `#[target_feature]` umbrella (mirrors
6783    /// `build_optimal_plan_impl`). See `start_matching_btlazy2_body!` for the
6784    /// shared loop.
6785    fn start_matching_btlazy2(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
6786        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6787        unsafe {
6788            self.start_matching_btlazy2_neon(&mut handle_sequence)
6789        }
6790        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6791        {
6792            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
6793            match select_kernel() {
6794                FastpathKernel::Avx2Bmi2 => unsafe {
6795                    self.start_matching_btlazy2_avx2_bmi2(&mut handle_sequence)
6796                },
6797                FastpathKernel::Sse42 => unsafe {
6798                    self.start_matching_btlazy2_sse42(&mut handle_sequence)
6799                },
6800                FastpathKernel::Scalar => self.start_matching_btlazy2_scalar(&mut handle_sequence),
6801            }
6802        }
6803        #[cfg(not(any(
6804            all(target_arch = "aarch64", target_endian = "little"),
6805            target_arch = "x86",
6806            target_arch = "x86_64"
6807        )))]
6808        {
6809            self.start_matching_btlazy2_scalar(&mut handle_sequence)
6810        }
6811    }
6812
6813    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
6814    #[target_feature(enable = "neon")]
6815    unsafe fn start_matching_btlazy2_neon(
6816        &mut self,
6817        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6818    ) {
6819        start_matching_btlazy2_body!(
6820            self,
6821            handle_sequence,
6822            collect_optimal_candidates_initialized_neon,
6823            crate::encoding::fastpath::neon::count_match_from_indices
6824        )
6825    }
6826
6827    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6828    #[target_feature(enable = "sse4.2")]
6829    unsafe fn start_matching_btlazy2_sse42(
6830        &mut self,
6831        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6832    ) {
6833        start_matching_btlazy2_body!(
6834            self,
6835            handle_sequence,
6836            collect_optimal_candidates_initialized_sse42,
6837            crate::encoding::fastpath::sse42::count_match_from_indices
6838        )
6839    }
6840
6841    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
6842    #[target_feature(enable = "avx2,bmi2")]
6843    unsafe fn start_matching_btlazy2_avx2_bmi2(
6844        &mut self,
6845        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6846    ) {
6847        start_matching_btlazy2_body!(
6848            self,
6849            handle_sequence,
6850            collect_optimal_candidates_initialized_avx2_bmi2,
6851            crate::encoding::fastpath::avx2_bmi2::count_match_from_indices
6852        )
6853    }
6854
6855    // Scalar wrapper: no `#[target_feature]`; `$collect` (the scalar collect)
6856    // is a safe fn, so the body macro's `unsafe` block is inert here. Same cfg
6857    // as `collect_optimal_candidates_initialized_scalar` (absent on
6858    // aarch64-little, where NEON is the baseline tier).
6859    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
6860    #[allow(unused_unsafe)]
6861    fn start_matching_btlazy2_scalar(
6862        &mut self,
6863        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6864    ) {
6865        start_matching_btlazy2_body!(
6866            self,
6867            handle_sequence,
6868            collect_optimal_candidates_initialized_scalar,
6869            crate::encoding::fastpath::scalar::count_match_from_indices
6870        )
6871    }
6872
6873    fn start_matching_optimal<S: super::strategy::Strategy>(
6874        &mut self,
6875        mut handle_sequence: impl for<'a> FnMut(Sequence<'a>),
6876    ) {
6877        self.table.ensure_tables();
6878        // Borrowed-aware: owned → last committed chunk; borrowed → staged
6879        // in-place block range.
6880        let (current_abs_start, current_len) = self.table.current_block_range();
6881        if current_len == 0 {
6882            return;
6883        }
6884        let current_ptr = self.table.get_last_space().as_ptr();
6885        // `start_matching_optimal()` mutates tables/state but never mutates or
6886        // reallocates `self.table.history`, so this tail slice remains valid for
6887        // the duration of the routine and avoids cloning the full block.
6888        let current = unsafe { core::slice::from_raw_parts(current_ptr, current_len) };
6889
6890        let current_abs_end = current_abs_start + current_len;
6891        self.table
6892            .apply_limited_update_after_long_match(current_abs_start);
6893        let hash3_start_cursor = self
6894            .table
6895            .skip_insert_until_abs
6896            .max(self.table.history_abs_start);
6897        self.table
6898            .backfill_boundary_positions(current_abs_start, current_abs_end);
6899        self.table.next_to_update3 = hash3_start_cursor;
6900        // Borrow split: `prepare_ldm_candidates` needs immutable
6901        // access to the live history (the post-`history_start`
6902        // slice of `self.table.history`) while it mutates the LDM
6903        // bucket table owned by `self.backend.bt_mut()`. Both live
6904        // in disjoint fields of `Self`, so we capture the slice +
6905        // its base before reaching for `bt_mut()`.
6906        //
6907        // The producer operates in absolute stream coordinates
6908        // throughout; `live_history[0]` corresponds to absolute
6909        // `history_abs_start` (upstream zstd `base + dictLimit`), and the
6910        // abs→slice translation happens inside the producer at
6911        // each `live_history[..]` access. Passing the full
6912        // `history` Vec would index into the dead prefix (the
6913        // bytes already retired past `history_start`).
6914        let live_history = self.table.live_history();
6915        let history_abs_start = self.table.history_abs_start;
6916        self.backend.bt_mut().prepare_ldm_candidates(
6917            live_history,
6918            history_abs_start,
6919            current_abs_start,
6920            current_len,
6921        );
6922
6923        if self.should_run_btultra2_seed_pass::<S>(current_len) {
6924            self.run_btultra2_seed_pass(current, current_abs_start, current_len);
6925        }
6926
6927        // Const-generic profile selection: every field is folded from
6928        // S's associated consts (MAX_CHAIN_DEPTH /
6929        // SUFFICIENT_MATCH_LEN / ACCURATE_PRICE / FAVOR_SMALL_OFFSETS),
6930        // so the optimiser produces the literal at codegen time
6931        // without a runtime match.
6932        let profile = HcOptimalCostProfile::const_for_strategy::<S>();
6933        let mut opt_state =
6934            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
6935        opt_state.rescale_freqs(current, profile);
6936        let mut best_plan = core::mem::take(&mut self.backend.bt_mut().opt_segment_plan_scratch);
6937        best_plan.clear();
6938        let mut plan_reps = self.table.offset_hist;
6939        let (mut cursor, mut plan_litlen) =
6940            self.table.opt_start_cursor_and_litlen(current_abs_start);
6941        let mut plan_literals_cursor = 0usize;
6942        let match_loop_limit = current_len.saturating_sub(8);
6943        while cursor < match_loop_limit {
6944            let remaining_len = current_len - cursor;
6945            let segment_abs_start = current_abs_start + cursor;
6946            let segment_start = best_plan.len();
6947            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
6948                &current[cursor..],
6949                segment_abs_start,
6950                remaining_len,
6951                HcOptimalPlanState {
6952                    block_offset: cursor,
6953                    reps: plan_reps,
6954                    litlen: plan_litlen,
6955                    profile,
6956                },
6957                &opt_state,
6958                &mut best_plan,
6959            );
6960            BtMatcher::update_plan_stats_segment(
6961                current,
6962                current_len,
6963                &best_plan[segment_start..],
6964                &mut plan_literals_cursor,
6965                &mut plan_reps,
6966                &mut opt_state,
6967                profile.accurate,
6968            );
6969            plan_reps = end_reps;
6970            plan_litlen = end_litlen;
6971            cursor += consumed_len;
6972        }
6973
6974        self.table
6975            .emit_optimal_plan(current_len, &best_plan, &mut handle_sequence);
6976        best_plan.clear();
6977        self.backend.bt_mut().opt_segment_plan_scratch = best_plan;
6978        self.backend.bt_mut().opt_state = opt_state;
6979    }
6980
6981    fn run_btultra2_seed_pass(
6982        &mut self,
6983        current: &[u8],
6984        current_abs_start: usize,
6985        current_len: usize,
6986    ) {
6987        // The seed pass is BtUltra2-exclusive by name (the only
6988        // caller is `should_run_btultra2_seed_pass`), so pin `S` to
6989        // `BtUltra2` for both the cost-profile lookup and the
6990        // `build_optimal_plan::<S>` call below.
6991        type S = super::strategy::BtUltra2;
6992        let seed_profile = HcOptimalCostProfile::const_for_strategy::<S>();
6993        let mut opt_state =
6994            core::mem::replace(&mut self.backend.bt_mut().opt_state, HcOptState::new());
6995        opt_state.rescale_freqs(current, seed_profile);
6996        let mut seed_reps = self.table.offset_hist;
6997        let (mut cursor, mut seed_litlen) =
6998            self.table.opt_start_cursor_and_litlen(current_abs_start);
6999        let mut seed_literals_cursor = 0usize;
7000        let mut seed_plan = core::mem::take(&mut self.backend.bt_mut().opt_seed_plan_scratch);
7001        seed_plan.clear();
7002        let match_loop_limit = current_len.saturating_sub(8);
7003        while cursor < match_loop_limit {
7004            let remaining_len = current_len - cursor;
7005            let segment_abs_start = current_abs_start + cursor;
7006            let segment_start = seed_plan.len();
7007            let (_, end_reps, end_litlen, consumed_len) = self.build_optimal_plan::<S>(
7008                &current[cursor..],
7009                segment_abs_start,
7010                remaining_len,
7011                HcOptimalPlanState {
7012                    block_offset: cursor,
7013                    reps: seed_reps,
7014                    litlen: seed_litlen,
7015                    profile: seed_profile,
7016                },
7017                &opt_state,
7018                &mut seed_plan,
7019            );
7020            BtMatcher::update_plan_stats_segment(
7021                current,
7022                current_len,
7023                &seed_plan[segment_start..],
7024                &mut seed_literals_cursor,
7025                &mut seed_reps,
7026                &mut opt_state,
7027                seed_profile.accurate,
7028            );
7029            seed_plan.truncate(segment_start);
7030            seed_reps = end_reps;
7031            seed_litlen = end_litlen;
7032            cursor += consumed_len;
7033        }
7034        seed_plan.clear();
7035        self.backend.bt_mut().opt_seed_plan_scratch = seed_plan;
7036        self.backend.bt_mut().opt_state = opt_state;
7037
7038        // Upstream zstd initStats_ultra keeps the collected entropy statistics but
7039        // invalidates the first-pass matchfinder history before the real pass.
7040        self.table.position_base = self.table.history_abs_start;
7041        self.table.index_shift = current_len;
7042        self.table.next_to_update3 = current_abs_start;
7043        self.table.skip_insert_until_abs = current_abs_start;
7044        // Upstream zstd `ZSTD_initStats_ultra()` invalidates the first scan by moving
7045        // `window.base` back by `srcSize`, making the real pass start at
7046        // `curr == srcSize` instead of 0. Position 0 is therefore a valid
7047        // table entry in the second pass even though raw C tables reserve
7048        // value 0 as empty during an unshifted first pass.
7049        self.table.allow_zero_relative_position = true;
7050    }
7051
7052    fn build_optimal_plan<S: super::strategy::Strategy>(
7053        &mut self,
7054        current: &[u8],
7055        current_abs_start: usize,
7056        current_len: usize,
7057        initial_state: HcOptimalPlanState,
7058        stats: &HcOptState,
7059        out: &mut Vec<HcOptimalSequence>,
7060    ) -> (u32, [u32; 3], usize, usize) {
7061        debug_assert!(S::USE_BT, "build_optimal_plan called on non-BT strategy");
7062        debug_assert_eq!(initial_state.profile.accurate, S::ACCURATE_PRICE);
7063        debug_assert_eq!(
7064            initial_state.profile.favor_small_offsets,
7065            S::FAVOR_SMALL_OFFSETS
7066        );
7067        // `S::ACCURATE_PRICE` / `S::FAVOR_SMALL_OFFSETS` cannot appear
7068        // as const-generic arguments yet (`generic_const_exprs` is
7069        // still unstable), so dispatch over a 4-arm match — but on the
7070        // strategy's ASSOCIATED CONSTS, not the runtime profile (the
7071        // `debug_assert_eq`s above pin the runtime profile to those
7072        // consts). A const scrutinee folds the three dead arms at
7073        // monomorphisation; matching the runtime profile instead kept
7074        // all four `#[inline(always)]` DP bodies (~16 KB each) alive in
7075        // EVERY `S` instantiation — ~360 KB of the wasm payload.
7076        match (S::ACCURATE_PRICE, S::FAVOR_SMALL_OFFSETS) {
7077            (true, false) => self.build_optimal_plan_impl::<S, true, false>(
7078                current,
7079                current_abs_start,
7080                current_len,
7081                initial_state,
7082                stats,
7083                out,
7084            ),
7085            (true, true) => self.build_optimal_plan_impl::<S, true, true>(
7086                current,
7087                current_abs_start,
7088                current_len,
7089                initial_state,
7090                stats,
7091                out,
7092            ),
7093            (false, false) => self.build_optimal_plan_impl::<S, false, false>(
7094                current,
7095                current_abs_start,
7096                current_len,
7097                initial_state,
7098                stats,
7099                out,
7100            ),
7101            (false, true) => self.build_optimal_plan_impl::<S, false, true>(
7102                current,
7103                current_abs_start,
7104                current_len,
7105                initial_state,
7106                stats,
7107                out,
7108            ),
7109        }
7110    }
7111
7112    /// Cross-platform DP entry. Picks the kernel-specific variant so the
7113    /// entire optimal-parser DP body (per-position match gathering, price
7114    /// updates, traceback) runs inside a single `target_feature` umbrella
7115    /// alongside the per-position `collect_optimal_candidates_initialized_
7116    /// <kernel>`. This eliminates the final ABI barrier on the hot per-
7117    /// position match-collection call — the level22 critical path is now
7118    /// one straight-line inline chain from DP body down through BT walk
7119    /// and match-length probes.
7120    #[inline(always)]
7121    fn build_optimal_plan_impl<
7122        S: super::strategy::Strategy,
7123        const ACCURATE_PRICE: bool,
7124        const FAVOR_SMALL_OFFSETS: bool,
7125    >(
7126        &mut self,
7127        current: &[u8],
7128        current_abs_start: usize,
7129        current_len: usize,
7130        initial_state: HcOptimalPlanState,
7131        stats: &HcOptState,
7132        out: &mut Vec<HcOptimalSequence>,
7133    ) -> (u32, [u32; 3], usize, usize) {
7134        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7135        unsafe {
7136            self.build_optimal_plan_impl_neon::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7137                current,
7138                current_abs_start,
7139                current_len,
7140                initial_state,
7141                stats,
7142                out,
7143            )
7144        }
7145        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7146        {
7147            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
7148            match select_kernel() {
7149                FastpathKernel::Avx2Bmi2 => unsafe {
7150                    self.build_optimal_plan_impl_avx2_bmi2::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7151                        current,
7152                        current_abs_start,
7153                        current_len,
7154                        initial_state,
7155                        stats,
7156                        out,
7157                    )
7158                },
7159                FastpathKernel::Sse42 => unsafe {
7160                    self.build_optimal_plan_impl_sse42::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7161                        current,
7162                        current_abs_start,
7163                        current_len,
7164                        initial_state,
7165                        stats,
7166                        out,
7167                    )
7168                },
7169                FastpathKernel::Scalar => self
7170                    .build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7171                        current,
7172                        current_abs_start,
7173                        current_len,
7174                        initial_state,
7175                        stats,
7176                        out,
7177                    ),
7178            }
7179        }
7180        // wasm with simd128: route through the simd128 DP body (4-lane price-set).
7181        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
7182        unsafe {
7183            self.build_optimal_plan_impl_simd128::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7184                current,
7185                current_abs_start,
7186                current_len,
7187                initial_state,
7188                stats,
7189                out,
7190            )
7191        }
7192        #[cfg(not(any(
7193            all(target_arch = "aarch64", target_endian = "little"),
7194            target_arch = "x86",
7195            target_arch = "x86_64",
7196            all(target_arch = "wasm32", target_feature = "simd128")
7197        )))]
7198        {
7199            self.build_optimal_plan_impl_scalar::<S, ACCURATE_PRICE, FAVOR_SMALL_OFFSETS>(
7200                current,
7201                current_abs_start,
7202                current_len,
7203                initial_state,
7204                stats,
7205                out,
7206            )
7207        }
7208    }
7209
7210    /// NEON-umbrella DP body. Inlines
7211    /// `collect_optimal_candidates_initialized_neon` (and its entire
7212    /// per-position pipeline) directly into the DP loop.
7213    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
7214    #[target_feature(enable = "neon")]
7215    unsafe fn build_optimal_plan_impl_neon<
7216        S: super::strategy::Strategy,
7217        const ACCURATE_PRICE: bool,
7218        const FAVOR_SMALL_OFFSETS: bool,
7219    >(
7220        &mut self,
7221        current: &[u8],
7222        current_abs_start: usize,
7223        current_len: usize,
7224        initial_state: HcOptimalPlanState,
7225        stats: &HcOptState,
7226        out: &mut Vec<HcOptimalSequence>,
7227    ) -> (u32, [u32; 3], usize, usize) {
7228        build_optimal_plan_impl_body!(
7229            self,
7230            S,
7231            current,
7232            current_abs_start,
7233            current_len,
7234            initial_state,
7235            stats,
7236            out,
7237            collect_optimal_candidates_initialized_neon,
7238            priceset_range_nonabort_neon,
7239        )
7240    }
7241
7242    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7243    #[target_feature(enable = "sse4.2")]
7244    unsafe fn build_optimal_plan_impl_sse42<
7245        S: super::strategy::Strategy,
7246        const ACCURATE_PRICE: bool,
7247        const FAVOR_SMALL_OFFSETS: bool,
7248    >(
7249        &mut self,
7250        current: &[u8],
7251        current_abs_start: usize,
7252        current_len: usize,
7253        initial_state: HcOptimalPlanState,
7254        stats: &HcOptState,
7255        out: &mut Vec<HcOptimalSequence>,
7256    ) -> (u32, [u32; 3], usize, usize) {
7257        build_optimal_plan_impl_body!(
7258            self,
7259            S,
7260            current,
7261            current_abs_start,
7262            current_len,
7263            initial_state,
7264            stats,
7265            out,
7266            collect_optimal_candidates_initialized_sse42,
7267            priceset_range_nonabort_sse41,
7268        )
7269    }
7270
7271    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7272    #[target_feature(enable = "avx2,bmi2")]
7273    unsafe fn build_optimal_plan_impl_avx2_bmi2<
7274        S: super::strategy::Strategy,
7275        const ACCURATE_PRICE: bool,
7276        const FAVOR_SMALL_OFFSETS: bool,
7277    >(
7278        &mut self,
7279        current: &[u8],
7280        current_abs_start: usize,
7281        current_len: usize,
7282        initial_state: HcOptimalPlanState,
7283        stats: &HcOptState,
7284        out: &mut Vec<HcOptimalSequence>,
7285    ) -> (u32, [u32; 3], usize, usize) {
7286        build_optimal_plan_impl_body!(
7287            self,
7288            S,
7289            current,
7290            current_abs_start,
7291            current_len,
7292            initial_state,
7293            stats,
7294            out,
7295            collect_optimal_candidates_initialized_avx2_bmi2,
7296            priceset_range_nonabort_avx2,
7297        )
7298    }
7299
7300    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
7301    // Body macros wrap callees in `unsafe { }` for the NEON/AVX/SSE
7302    // variants where callees are `unsafe fn`. The scalar wrappers route
7303    // through safe fns, so those blocks are redundant on this path.
7304    #[allow(unused_unsafe)]
7305    // The dispatch reaches this only on non-SIMD x86 (Scalar tier) and the
7306    // portable fallback; on wasm+simd128 the simd128 wrapper is selected, so
7307    // this is cfg-dead there.
7308    #[cfg_attr(
7309        all(target_arch = "wasm32", target_feature = "simd128"),
7310        allow(dead_code)
7311    )]
7312    fn build_optimal_plan_impl_scalar<
7313        S: super::strategy::Strategy,
7314        const ACCURATE_PRICE: bool,
7315        const FAVOR_SMALL_OFFSETS: bool,
7316    >(
7317        &mut self,
7318        current: &[u8],
7319        current_abs_start: usize,
7320        current_len: usize,
7321        initial_state: HcOptimalPlanState,
7322        stats: &HcOptState,
7323        out: &mut Vec<HcOptimalSequence>,
7324    ) -> (u32, [u32; 3], usize, usize) {
7325        build_optimal_plan_impl_body!(
7326            self,
7327            S,
7328            current,
7329            current_abs_start,
7330            current_len,
7331            initial_state,
7332            stats,
7333            out,
7334            collect_optimal_candidates_initialized_scalar,
7335            priceset_range_nonabort_scalar,
7336        )
7337    }
7338
7339    /// wasm `simd128`-umbrella DP body: scalar candidate collection (no wasm
7340    /// collect kernel) but the simd128 4-lane price-set.
7341    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
7342    #[target_feature(enable = "simd128")]
7343    // With `+simd128` in the wasm baseline the shared body macro's `unsafe`
7344    // blocks (needed by the safe scalar wrapper) are redundant inside this
7345    // target_feature fn.
7346    #[allow(unused_unsafe)]
7347    unsafe fn build_optimal_plan_impl_simd128<
7348        S: super::strategy::Strategy,
7349        const ACCURATE_PRICE: bool,
7350        const FAVOR_SMALL_OFFSETS: bool,
7351    >(
7352        &mut self,
7353        current: &[u8],
7354        current_abs_start: usize,
7355        current_len: usize,
7356        initial_state: HcOptimalPlanState,
7357        stats: &HcOptState,
7358        out: &mut Vec<HcOptimalSequence>,
7359    ) -> (u32, [u32; 3], usize, usize) {
7360        build_optimal_plan_impl_body!(
7361            self,
7362            S,
7363            current,
7364            current_abs_start,
7365            current_len,
7366            initial_state,
7367            stats,
7368            out,
7369            collect_optimal_candidates_initialized_scalar,
7370            priceset_range_nonabort_simd128,
7371        )
7372    }
7373
7374    #[cfg(test)]
7375    fn collect_optimal_candidates(
7376        &mut self,
7377        abs_pos: usize,
7378        current_abs_end: usize,
7379        profile: HcOptimalCostProfile,
7380        query: HcCandidateQuery,
7381        out: &mut Vec<MatchCandidate>,
7382    ) {
7383        use super::strategy::{self, StrategyTag};
7384        self.table.ensure_tables();
7385        // Dispatch purely from `self.strategy_tag` (set by
7386        // `configure()`). Tests must configure the matcher the same
7387        // way production does — wiring up `table.hash3_log` directly
7388        // without setting a matching `strategy_tag` is no longer
7389        // allowed.
7390        match self.strategy_tag {
7391            StrategyTag::BtUltra2 => self
7392                .collect_optimal_candidates_initialized::<strategy::BtUltra2, true>(
7393                    abs_pos,
7394                    current_abs_end,
7395                    profile,
7396                    query,
7397                    out,
7398                ),
7399            StrategyTag::BtUltra => self
7400                .collect_optimal_candidates_initialized::<strategy::BtUltra, true>(
7401                    abs_pos,
7402                    current_abs_end,
7403                    profile,
7404                    query,
7405                    out,
7406                ),
7407            StrategyTag::Btlazy2 => self
7408                .collect_optimal_candidates_initialized::<strategy::Btlazy2, true>(
7409                    abs_pos,
7410                    current_abs_end,
7411                    profile,
7412                    query,
7413                    out,
7414                ),
7415            StrategyTag::BtOpt => self
7416                .collect_optimal_candidates_initialized::<strategy::BtOpt, true>(
7417                    abs_pos,
7418                    current_abs_end,
7419                    profile,
7420                    query,
7421                    out,
7422                ),
7423            StrategyTag::Fast | StrategyTag::Dfast | StrategyTag::Greedy | StrategyTag::Lazy => {
7424                self.collect_optimal_candidates_initialized::<strategy::Lazy, false>(
7425                    abs_pos,
7426                    current_abs_end,
7427                    profile,
7428                    query,
7429                    out,
7430                )
7431            }
7432        }
7433    }
7434
    /// Cross-platform entry. Picks the kernel-specific variant so the per-
    /// position pipeline (BT-tree fill, rep probing, hash3 probing, BT
    /// collect / HC chain walk) runs inside a single `target_feature`
    /// umbrella — all inner SIMD probes inline without ABI barriers.
    ///
    /// The on-encode hot path bypasses this dispatcher: `build_optimal_plan_impl_<kernel>`
    /// calls the matching `_<kernel>` variant directly. This entry is kept
    /// for the cfg(test)-only `collect_optimal_candidates` shim and any
    /// future caller that isn't already inside a kernel umbrella.
    ///
    /// Dispatch summary (the three `cfg` blocks below are mutually
    /// exclusive, so exactly one is compiled for any target):
    /// - little-endian aarch64: always the `_neon` variant;
    /// - x86 / x86_64: the variant named by `fastpath::select_kernel()`
    ///   (`_avx2_bmi2`, `_sse42` or `_scalar`);
    /// - every other target: the `_scalar` variant.
    ///
    /// `abs_pos`, `current_abs_end`, `profile`, `query` and `out` are handed
    /// to the selected variant unchanged.
    #[allow(dead_code)]
    #[inline(always)]
    fn collect_optimal_candidates_initialized<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
        // SAFETY: the callee only requires the `neon` target feature.
        // NOTE(review): this relies on NEON being available on every
        // little-endian aarch64 target this crate builds for — confirm.
        unsafe {
            self.collect_optimal_candidates_initialized_neon::<S, USE_BT_MATCHFINDER>(
                abs_pos,
                current_abs_end,
                profile,
                query,
                out,
            )
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            use crate::encoding::fastpath::{FastpathKernel, select_kernel};
            // NOTE(review): both `unsafe` arms assume `select_kernel()` only
            // reports a kernel whose CPU features are actually present on
            // the running machine — confirm against `fastpath::select_kernel`.
            match select_kernel() {
                // SAFETY: requires `avx2` + `bmi2`; see the note above.
                FastpathKernel::Avx2Bmi2 => unsafe {
                    self.collect_optimal_candidates_initialized_avx2_bmi2::<S, USE_BT_MATCHFINDER>(
                        abs_pos,
                        current_abs_end,
                        profile,
                        query,
                        out,
                    )
                },
                // SAFETY: requires `sse4.2`; see the note above.
                FastpathKernel::Sse42 => unsafe {
                    self.collect_optimal_candidates_initialized_sse42::<S, USE_BT_MATCHFINDER>(
                        abs_pos,
                        current_abs_end,
                        profile,
                        query,
                        out,
                    )
                },
                // Safe fallback: no target-feature requirement.
                FastpathKernel::Scalar => self
                    .collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
                        abs_pos,
                        current_abs_end,
                        profile,
                        query,
                        out,
                    ),
            }
        }
        // Any target that is neither little-endian aarch64 nor x86/x86_64.
        #[cfg(not(any(
            all(target_arch = "aarch64", target_endian = "little"),
            target_arch = "x86",
            target_arch = "x86_64"
        )))]
        {
            self.collect_optimal_candidates_initialized_scalar::<S, USE_BT_MATCHFINDER>(
                abs_pos,
                current_abs_end,
                profile,
                query,
                out,
            )
        }
    }
7514
    /// NEON-umbrella variant. Every inner helper (`bt_update_tree_until_neon`,
    /// `for_each_repcode_candidate_with_reps_neon`, `hash3_candidate_neon`,
    /// `bt_insert_and_collect_matches_neon`, `fastpath::neon::
    /// common_prefix_len_ptr`) shares the NEON umbrella so the per-position
    /// pipeline executes as a single straight-line inline sequence.
    ///
    /// The body is the shared `collect_optimal_candidates_initialized_body!`
    /// macro instantiated with the NEON helpers listed above; the function
    /// arguments are passed through to it unchanged.
    ///
    /// # Safety
    ///
    /// Declared with `#[target_feature(enable = "neon")]`: the caller must
    /// ensure NEON is available on the executing CPU.
    #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
    #[target_feature(enable = "neon")]
    unsafe fn collect_optimal_candidates_initialized_neon<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific helpers plugged into the shared body.
            bt_update_tree_until_neon,
            bt_insert_and_collect_matches_neon,
            for_each_repcode_candidate_with_reps_neon,
            hash3_candidate_neon,
            crate::encoding::fastpath::neon::common_prefix_len_ptr,
        )
    }
7549
    /// SSE4.2 variant of the candidate collector (x86 / x86_64 only).
    ///
    /// The body is the shared `collect_optimal_candidates_initialized_body!`
    /// macro instantiated with the `_sse42` helpers and
    /// `fastpath::sse42::common_prefix_len_ptr`; the function arguments are
    /// passed through to it unchanged.
    ///
    /// # Safety
    ///
    /// Declared with `#[target_feature(enable = "sse4.2")]`: the caller must
    /// ensure the executing CPU supports SSE4.2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "sse4.2")]
    unsafe fn collect_optimal_candidates_initialized_sse42<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific helpers plugged into the shared body.
            bt_update_tree_until_sse42,
            bt_insert_and_collect_matches_sse42,
            for_each_repcode_candidate_with_reps_sse42,
            hash3_candidate_sse42,
            crate::encoding::fastpath::sse42::common_prefix_len_ptr,
        )
    }
7579
    /// AVX2 + BMI2 variant of the candidate collector (x86 / x86_64 only).
    ///
    /// The body is the shared `collect_optimal_candidates_initialized_body!`
    /// macro instantiated with the `_avx2_bmi2` helpers and
    /// `fastpath::avx2_bmi2::common_prefix_len_ptr`; the function arguments
    /// are passed through to it unchanged.
    ///
    /// # Safety
    ///
    /// Declared with `#[target_feature(enable = "avx2,bmi2")]`: the caller
    /// must ensure the executing CPU supports both AVX2 and BMI2.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[target_feature(enable = "avx2,bmi2")]
    unsafe fn collect_optimal_candidates_initialized_avx2_bmi2<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific helpers plugged into the shared body.
            bt_update_tree_until_avx2_bmi2,
            bt_insert_and_collect_matches_avx2_bmi2,
            for_each_repcode_candidate_with_reps_avx2_bmi2,
            hash3_candidate_avx2_bmi2,
            crate::encoding::fastpath::avx2_bmi2::common_prefix_len_ptr,
        )
    }
7609
    /// Scalar variant of the candidate collector.
    ///
    /// Safe to call (no `target_feature` requirement). Compiled on every
    /// target except little-endian aarch64. The body is the shared
    /// `collect_optimal_candidates_initialized_body!` macro instantiated
    /// with the `_scalar` helpers and
    /// `fastpath::scalar::common_prefix_len_ptr`; the function arguments are
    /// passed through to it unchanged.
    #[cfg(not(all(target_arch = "aarch64", target_endian = "little")))]
    // Macro emits `unsafe { }` wrappers for NEON/AVX/SSE variants; scalar
    // callees are safe so the blocks are redundant here only.
    #[allow(unused_unsafe)]
    fn collect_optimal_candidates_initialized_scalar<
        S: super::strategy::Strategy,
        const USE_BT_MATCHFINDER: bool,
    >(
        &mut self,
        abs_pos: usize,
        current_abs_end: usize,
        profile: HcOptimalCostProfile,
        query: HcCandidateQuery,
        out: &mut Vec<MatchCandidate>,
    ) {
        collect_optimal_candidates_initialized_body!(
            self,
            S,
            abs_pos,
            current_abs_end,
            profile,
            query,
            out,
            USE_BT_MATCHFINDER,
            // Kernel-specific helpers plugged into the shared body.
            bt_update_tree_until_scalar,
            bt_insert_and_collect_matches_scalar,
            for_each_repcode_candidate_with_reps_scalar,
            hash3_candidate_scalar,
            crate::encoding::fastpath::scalar::common_prefix_len_ptr,
        )
    }
7641}
7642
// Legacy round-trip test, permanently compiled out: `cfg(any())` with an
// empty predicate list is always false. It feeds a series of blocks into a
// `MatchGenerator`, replays every emitted sequence into `reconstructed`, and
// finally checks that the replayed bytes equal the concatenated input.
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn matches() {
    let mut matcher = MatchGenerator::new(1000);
    let mut original_data = Vec::new();
    let mut reconstructed = Vec::new();

    // Decoder stand-in: appends the literals, then copies `match_len` bytes
    // starting `offset` bytes back. The copy goes byte by byte so that a
    // match overlapping its own output (`offset < match_len`) reads bytes
    // pushed earlier in the same loop.
    let replay_sequence = |seq: Sequence<'_>, reconstructed: &mut Vec<u8>| match seq {
        Sequence::Literals { literals } => {
            assert!(!literals.is_empty());
            reconstructed.extend_from_slice(literals);
        }
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            assert!(offset > 0);
            assert!(match_len >= MIN_MATCH_LEN);
            reconstructed.extend_from_slice(literals);
            assert!(offset <= reconstructed.len());
            let start = reconstructed.len() - offset;
            for i in 0..match_len {
                let byte = reconstructed[start + i];
                reconstructed.push(byte);
            }
        }
    };

    // Block 1: ten zero bytes — one sequence is replayed, then the matcher
    // must report that nothing is left.
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));

    assert!(!matcher.next_sequence(|_| {}));

    // Block 2: `1..=6` three times followed by five zeros — three sequences
    // are replayed.
    matcher.add_data(
        alloc::vec![
            1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
        ],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[
        1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
    ]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 3: `1..=11` followed by five zeros — two sequences are replayed.
    matcher.add_data(
        alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 4: five zeros — one sequence is replayed.
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 5: `7..=11`, bytes that already occurred in block 3 — one
    // sequence is replayed.
    matcher.add_data(
        alloc::vec![7, 8, 9, 10, 11],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[7, 8, 9, 10, 11]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 6: matching is skipped for this block, so no sequence is
    // produced; its bytes are appended to `reconstructed` by hand.
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    matcher.skip_matching();
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
    reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
    assert!(!matcher.next_sequence(|_| {}));

    // Block 7: the same bytes as the skipped block — one sequence is
    // replayed.
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Block 8: a five-byte run repeated inside the block itself — two
    // sequences are replayed.
    matcher.add_data(
        alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);

    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    matcher.next_sequence(|seq| replay_sequence(seq, &mut reconstructed));
    assert!(!matcher.next_sequence(|_| {}));

    // Everything replayed (plus the hand-copied skipped block) must equal
    // the concatenated input.
    assert_eq!(reconstructed, original_data);
}
7763
/// Feeds two identical 128 KiB blocks (a 10-byte pattern repeated) through a
/// `DfastMatchGenerator` and checks that replaying the emitted sequences
/// reproduces each block exactly; the second block is replayed on top of
/// the first block's output.
#[test]
fn dfast_matches_roundtrip_multi_block_pattern() {
    const PATTERN: [u8; 10] = [9, 21, 44, 184, 19, 96, 171, 109, 141, 251];
    const BLOCK_LEN: usize = 128 * 1024;

    /// Applies one sequence to `decoded` the way a decoder would. The match
    /// copy is done one byte at a time so overlapping matches work.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        let (literals, copy) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        decoded.extend_from_slice(literals);
        if let Some((offset, match_len)) = copy {
            let start = decoded.len() - offset;
            for src in (start..).take(match_len) {
                let byte = decoded[src];
                decoded.push(byte);
            }
        }
    }

    let first_block: Vec<u8> = (0..BLOCK_LEN).map(|i| PATTERN[i % PATTERN.len()]).collect();
    let second_block: Vec<u8> = first_block.clone();

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let mut history: Vec<u8> = Vec::new();

    // First block: the replay must reproduce it in full.
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay(&mut history, seq));
    assert_eq!(history, first_block);

    // Second block: only the bytes appended after the first block count.
    matcher.add_data(second_block.clone(), |_| {});
    let prefix_len = history.len();
    matcher.start_matching(|seq| replay(&mut history, seq));

    assert_eq!(&history[prefix_len..], second_block.as_slice());
}
7798
/// Regression for the `DFAST_MIN_MATCH_LEN: 6 -> 5` drop. The fixture
/// is built so the longest available match is EXACTLY 5 bytes — a
/// matcher that still effectively requires a 6-byte floor would emit
/// only literals here and the assertion would catch the silent
/// 5-byte miss.
///
/// Fixture layout (55 B):
///   byte  0       `'Z'`      — lead byte, keeps the match source off
///                              position 0
///   bytes 1..6    `"ABCDE"`  — match source
///   bytes 6..29   `'!'` × 23 — filler that does NOT start with 'A'
///   bytes 29..34  `"ABCDE"`  — match site (repeats the source)
///   byte  34      `'F'`      — terminator: differs from byte 6 (`'!'`),
///                              so the forward extension at the match
///                              site stops at exactly length 5.
///   bytes 35..55  `"GHIJKLMNOPQRSTUVWXYZ"` — trailing filler
///
/// A 5-byte match at offset 28 (= 29 - 1) must be emitted; a 6-byte+ match
/// at the same offset must NOT.
#[test]
fn dfast_accepts_exact_five_byte_match() {
    let mut data = Vec::new();
    // Lead byte: keeps the match SOURCE off position 0, which the greedy
    // loop never inserts — like the upstream zstd it starts the cursor at
    // ip+1 and hashes only visited positions. Position 1 IS visited.
    data.push(b'Z'); // 0
    data.extend_from_slice(b"ABCDE"); // 1..6: match source
    data.extend_from_slice(b"!!!!!!!!!!!!!!!!!!!!!!!"); // 6..29 (23 bytes)
    data.extend_from_slice(b"ABCDE"); // 29..34: the 5-byte match site
    data.push(b'F'); // 34: forces forward extension to stop at length 5
    // Trailing filler so the match site (29) sits at least HASH_READ_SIZE (8)
    // bytes before the block end. The greedy double-fast — like the upstream zstd —
    // stops probing at `ilimit = iend - HASH_READ_SIZE`, so a match in the
    // final 8 bytes is never searched (upstream zstd parity, not a regression).
    data.extend_from_slice(b"GHIJKLMNOPQRSTUVWXYZ"); // 35..55
    assert_eq!(data.len(), 55);

    let mut matcher = DfastMatchGenerator::new(1 << 22);
    matcher.add_data(data.clone(), |_| {});

    // Only matches at offset 28 are classified; literals and matches at any
    // other offset are ignored.
    let mut saw_five_byte_match = false;
    let mut saw_longer_match = false;
    matcher.start_matching(|seq| {
        if let Sequence::Triple {
            offset, match_len, ..
        } = seq
        {
            if offset == 28 && match_len == 5 {
                saw_five_byte_match = true;
            } else if offset == 28 && match_len > 5 {
                saw_longer_match = true;
            }
        }
    });

    assert!(
        saw_five_byte_match,
        "dfast must accept the exact-5-byte match — a 6-byte floor would skip it"
    );
    assert!(
        !saw_longer_match,
        "fixture pinned to length 5 — byte 34 ('F') must terminate the extension"
    );
}
7870
/// `reset(Default)` must leave the driver on the Dfast backend with a
/// `1 << 21` window and able to match the second slice against a first
/// slice whose matching was skipped. A following `reset(Fastest)` must
/// report a `1 << 19` window.
#[test]
fn driver_switches_backends_and_initializes_dfast_via_reset() {
    use super::strategy::BackendTag;

    const CHUNK: &[u8; 12] = b"abcabcabcabc";

    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Default);
    assert_eq!(driver.active_backend(), BackendTag::Dfast);
    assert_eq!(driver.window_size(), (1u64 << 21));

    // Slice 1: committed, then skipped instead of matched.
    let mut first = driver.get_next_space();
    first[..CHUNK.len()].copy_from_slice(CHUNK);
    first.truncate(CHUNK.len());
    driver.commit_space(first);
    assert_eq!(driver.get_last_space(), CHUNK);
    driver.skip_matching_with_hint(None);

    // Slice 2: same bytes, this time run through the matcher.
    let mut second = driver.get_next_space();
    second[..CHUNK.len()].copy_from_slice(CHUNK);
    second.truncate(CHUNK.len());
    driver.commit_space(second);

    // Replay starts from slice 1's bytes, since no sequences were emitted
    // for it.
    let mut reconstructed = CHUNK.to_vec();
    driver.start_matching(|seq| {
        let (literals, copy) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        reconstructed.extend_from_slice(literals);
        if let Some((offset, match_len)) = copy {
            let start = reconstructed.len() - offset;
            for src in (start..).take(match_len) {
                let byte = reconstructed[src];
                reconstructed.push(byte);
            }
        }
    });
    assert_eq!(reconstructed, b"abcabcabcabcabcabcabcabc");

    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), (1u64 << 19));
}
7912
/// Level 5 must activate the Row backend and pair it with the greedy parse.
#[test]
fn driver_level5_selects_row_backend() {
    use super::strategy::{BackendTag, ParseMode};

    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.reset(CompressionLevel::Level(5));

    let backend = driver.active_backend();
    assert_eq!(backend, BackendTag::Row);

    // Routing check specific to greedy. `MatchGeneratorDriver::start_matching`
    // sends the Row backend to `start_matching_greedy` only when
    // `self.parse == ParseMode::Greedy`, so that selector is asserted
    // directly: a plain round-trip would also pass on the lazy parser.
    assert_eq!(
        driver.parse,
        ParseMode::Greedy,
        "L5 must route to start_matching_greedy (parse == Greedy)",
    );

    // Secondary corroboration: `lazy_depth` mirrors the parse mode. Checking
    // `parse` above still catches a regression should the two ever drift.
    let row_lazy_depth = driver.row_matcher().lazy_depth;
    assert_eq!(
        row_lazy_depth,
        0,
        "row matcher lazy_depth must mirror the greedy parse mode",
    );
}
7936
/// Level 4 maps to `StrategyTag::Dfast` (the greedy double-fast, upstream zstd
/// `ZSTD_dfast` — "greedy" names the parse discipline here, not the
/// Row/Greedy strategy used at Level 5).
///
/// A round-trip on its own says nothing about match quality, because a
/// lazy parser would reconstruct the input just as well. This test
/// therefore also requires at least one `Sequence::Triple` for a small
/// repeating pattern, which catches a regression that degrades the parse to
/// literals only (e.g. a `min_match` or rep-probe guard regression).
#[test]
fn driver_level4_greedy_round_trip_single_slice() {
    /// Applies one sequence to `buf`; returns `true` when it carried a match.
    fn replay(buf: &mut Vec<u8>, seq: Sequence<'_>) -> bool {
        let (literals, copy) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        buf.extend_from_slice(literals);
        let Some((offset, match_len)) = copy else {
            return false;
        };
        let start = buf.len() - offset;
        for src in (start..).take(match_len) {
            let byte = buf[src];
            buf.push(byte);
        }
        true
    }

    let input = b"abcdefgh_abcdefgh_abcdefgh_abcdefgh";

    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    let mut space = driver.get_next_space();
    space[..input.len()].copy_from_slice(input);
    space.truncate(input.len());
    driver.commit_space(space);

    let mut reconstructed: Vec<u8> = Vec::new();
    let mut saw_triple = false;
    driver.start_matching(|seq| {
        if replay(&mut reconstructed, seq) {
            saw_triple = true;
        }
    });

    assert_eq!(
        reconstructed,
        input.to_vec(),
        "L4 greedy parse failed to reconstruct repeating-pattern input",
    );
    assert!(
        saw_triple,
        "L4 greedy parse on a repeating pattern must emit at least one match (Triple)",
    );
}
7982
/// The greedy parse must keep repcode / hash-table state across slice
/// boundaries. The second slice is a byte-for-byte copy of the first, so
/// the parse has to emit a match that reaches back into the first slice's
/// history.
#[test]
fn driver_level4_greedy_round_trip_cross_slice() {
    /// Applies one sequence to `buf`; returns the match offset, if any.
    fn replay(buf: &mut Vec<u8>, seq: Sequence<'_>) -> Option<usize> {
        let (literals, copy) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        buf.extend_from_slice(literals);
        let (offset, match_len) = copy?;
        let start = buf.len() - offset;
        for src in (start..).take(match_len) {
            let byte = buf[src];
            buf.push(byte);
        }
        Some(offset)
    }

    let chunk = b"the quick brown fox jumps over!!";
    assert_eq!(chunk.len(), 32);

    let mut driver = MatchGeneratorDriver::new(32, 4);
    driver.reset(CompressionLevel::Level(4));

    // Slice 1 has to round-trip by itself.
    let mut first = driver.get_next_space();
    first[..chunk.len()].copy_from_slice(chunk);
    first.truncate(chunk.len());
    driver.commit_space(first);

    let mut first_recon: Vec<u8> = Vec::new();
    driver.start_matching(|seq| {
        replay(&mut first_recon, seq);
    });
    assert_eq!(
        first_recon,
        chunk.to_vec(),
        "first slice failed to round-trip"
    );

    // Slice 2 repeats slice 1.
    let mut second = driver.get_next_space();
    second[..chunk.len()].copy_from_slice(chunk);
    second.truncate(chunk.len());
    driver.commit_space(second);

    let mut full = first_recon.clone();
    let mut saw_cross_slice_match = false;
    driver.start_matching(|seq| {
        // An offset of at least one whole slice means the match source lies
        // in the first slice — the cross-slice behavior under test.
        if let Some(offset) = replay(&mut full, seq) {
            if offset >= chunk.len() {
                saw_cross_slice_match = true;
            }
        }
    });

    let expected = chunk.repeat(2);
    assert_eq!(
        full, expected,
        "cross-slice L4 greedy parse failed to reconstruct"
    );
    assert!(
        saw_cross_slice_match,
        "L4 greedy parse must match across slice boundaries (history is shared)",
    );
}
8060
/// Test-only extensions of `MatchGeneratorDriver`: helpers that stage a
/// parse×search override so a level can be reset onto a non-default
/// (search, parse) pairing.
#[cfg(test)]
impl MatchGeneratorDriver {
    /// Test-only: stage a parse×search recipe override applied on the
    /// next `reset()`. Routes a level through a non-default (parse,
    /// search) pair so the decoupling can be exercised end-to-end.
    ///
    /// This method only stores the pair in `self.config_override`; nothing
    /// else changes until `reset()` is called.
    pub(crate) fn set_config_override(
        &mut self,
        search: super::strategy::SearchMethod,
        parse: super::strategy::ParseMode,
    ) {
        self.config_override = Some((search, parse));
    }

    /// Test-only: reset `level` routed onto the lazy HashChain pairing.
    /// The lazy band runs on the Row backend in production, so HC-specific
    /// behaviour (live-chain dict prime, eviction budget accounting, seed
    /// pass gates) is exercised through this override-backed reset.
    ///
    /// Equivalent to staging `(SearchMethod::HashChain, ParseMode::Lazy2)`
    /// with `set_config_override` and then calling `reset(level)`.
    pub(crate) fn reset_on_hc_lazy(&mut self, level: CompressionLevel) {
        self.set_config_override(
            super::strategy::SearchMethod::HashChain,
            super::strategy::ParseMode::Lazy2,
        );
        self.reset(level);
    }
}
8089
/// Runs a complete compress parse over `data` at `level` and rebuilds the
/// bytes from the emitted sequences.
///
/// When `over` is `Some((search, parse))` the pair is staged with
/// `set_config_override` before `reset(level)`. The input is fed slice by
/// slice, each slice sized by what `get_next_space()` hands out, and matched
/// right after it is committed. A correct parse returns a buffer equal to
/// `data`.
#[cfg(test)]
fn drive_roundtrip_with_override(
    level: CompressionLevel,
    over: Option<(super::strategy::SearchMethod, super::strategy::ParseMode)>,
    data: &[u8],
) -> Vec<u8> {
    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);
    if let Some((search, parse)) = over {
        driver.set_config_override(search, parse);
    }
    driver.reset(level);

    let mut rebuilt: Vec<u8> = Vec::with_capacity(data.len());
    let mut pending = data;
    while !pending.is_empty() {
        // Fill the next slice with as much of the remaining input as fits.
        let mut space = driver.get_next_space();
        let take = pending.len().min(space.len());
        let (head, tail) = pending.split_at(take);
        space[..take].copy_from_slice(head);
        space.truncate(take);
        driver.commit_space(space);
        pending = tail;

        driver.start_matching(|seq| {
            let (literals, copy) = match seq {
                Sequence::Literals { literals } => (literals, None),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals, Some((offset, match_len))),
            };
            rebuilt.extend_from_slice(literals);
            if let Some((offset, match_len)) = copy {
                // Byte-wise copy so a match may overlap its own output.
                let start = rebuilt.len() - offset;
                for src in (start..).take(match_len) {
                    let byte = rebuilt[src];
                    rebuilt.push(byte);
                }
            }
        });
    }
    rebuilt
}
8133
/// Phase 1 capability proof: parse and search are independent axes, so any
/// parse mode can run on any non-opt search backend. Greedy-on-HashChain
/// and Lazy2-on-RowHash could not be expressed through the legacy
/// `strategy_tag`; every pairing below must reconstruct the input exactly.
#[test]
fn parse_search_matrix_decoupled_roundtrips() {
    use super::strategy::{ParseMode, SearchMethod};

    // Mixed repetitive + literal payload that exercises matches and reps.
    let data: Vec<u8> = (0..4000u32)
        .flat_map(|i| {
            b"the quick brown fox "
                .iter()
                .copied()
                .chain(i.to_le_bytes())
        })
        .collect();

    let cases = [
        // Greedy parse on the HashChain search backend (legacy: Greedy was
        // welded to RowHash).
        (
            5,
            SearchMethod::HashChain,
            ParseMode::Greedy,
            "greedy-on-hashchain diverged",
        ),
        // Lazy2 parse on the RowHash search backend (legacy: Lazy was welded
        // to HashChain).
        (
            8,
            SearchMethod::RowHash,
            ParseMode::Lazy2,
            "lazy2-on-rowhash diverged",
        ),
        // Lazy on RowHash too (depth 1).
        (
            6,
            SearchMethod::RowHash,
            ParseMode::Lazy,
            "lazy-on-rowhash diverged",
        ),
    ];

    for (level, search, parse, failure) in cases {
        let got = drive_roundtrip_with_override(
            CompressionLevel::Level(level),
            Some((search, parse)),
            &data,
        );
        assert_eq!(got, data, "{}", failure);
    }
}
8174
/// The row `mls` knob (C-like `minMatch`) is honoured. For every value in
/// 4..=7 the stream must round-trip, and no accepted match (regular row or
/// repcode, on the lazy parse) may be shorter than `mls`. The default (5)
/// reproduces the historical `ROW_MIN_MATCH_LEN` behaviour.
#[test]
fn row_mls_knob_gates_matches_and_roundtrips() {
    // 4000 records: a fixed 8-byte tag followed by a little-endian counter.
    let mut data: Vec<u8> = Vec::new();
    for i in 0..4000u32 {
        data.extend_from_slice(b"abcdefgh");
        data.extend_from_slice(&i.to_le_bytes());
    }

    for mls in 4usize..=7 {
        let mut matcher = RowMatchGenerator::new(1 << 22);
        let mut cfg = ROW_CONFIG;
        cfg.mls = mls;
        matcher.configure(cfg);
        matcher.add_data(data.clone(), |_| {});

        let mut out: Vec<u8> = Vec::with_capacity(data.len());
        // `None` until the first match is seen.
        let mut shortest: Option<usize> = None;
        matcher.start_matching(|seq| {
            let (literals, copy) = match seq {
                Sequence::Literals { literals } => (literals, None),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals, Some((offset, match_len))),
            };
            out.extend_from_slice(literals);
            if let Some((offset, match_len)) = copy {
                shortest = Some(shortest.map_or(match_len, |seen: usize| seen.min(match_len)));
                let start = out.len() - offset;
                for src in (start..).take(match_len) {
                    let byte = out[src];
                    out.push(byte);
                }
            }
        });

        assert_eq!(out, data, "mls={mls} round-trip diverged");
        // The floor is only checked when at least one match was emitted.
        if let Some(shortest_match) = shortest {
            assert!(
                shortest_match >= mls,
                "mls={mls}: emitted a {shortest_match}-byte match below the floor",
            );
        }
    }
}
8224
/// `LevelParams::parse()` must derive the parse mode from the `search` axis
/// rather than from the strategy tag, so the decoupling also holds for a
/// `Bt*`-tagged level whose search has been overridden to a non-BT backend.
/// Before the fix the method matched on `strategy_tag` and answered
/// `Optimal` for every `Bt*` tag, ignoring `search` and `lazy_depth`.
#[test]
fn parse_mode_follows_search_axis_not_strategy_tag() {
    use super::strategy::{ParseMode, SearchMethod};

    // LEVEL_TABLE[15] is level 16: BtOpt tag, BinaryTree search.
    let mut params = LEVEL_TABLE[15];
    assert_eq!(
        params.parse(),
        ParseMode::Optimal,
        "BinaryTree search → Optimal"
    );

    // Swap the search backend on the Bt-tagged level; from here on the parse
    // mode has to track `lazy_depth` instead of staying Optimal.
    params.search = SearchMethod::RowHash;
    let expectations = [
        (0, ParseMode::Greedy, "RowHash + depth 0 → Greedy"),
        (2, ParseMode::Lazy2, "RowHash + depth 2 → Lazy2"),
    ];
    for (depth, expected, failure) in expectations {
        params.lazy_depth = depth;
        assert_eq!(params.parse(), expected, "{failure}");
    }
}
8244
/// The test-only `config_override` is one-shot: the first `reset()` consumes
/// it, so a reused driver cannot silently keep the synthetic pairing armed
/// for later resets. Before the fix `reset()` copied the override and left
/// it in place.
#[test]
fn config_override_is_consumed_by_reset() {
    use super::strategy::{ParseMode, SearchMethod};

    let mut driver = MatchGeneratorDriver::new(1 << 17, 8);

    // Arm the override and confirm it is actually stored.
    driver.set_config_override(SearchMethod::RowHash, ParseMode::Lazy2);
    assert!(driver.config_override.is_some());

    // A single reset must clear it again.
    driver.reset(CompressionLevel::Level(5));
    let still_armed = driver.config_override.is_some();
    assert!(!still_armed, "override must be consumed after one reset",);
}
8261
// Round-trip helper for Level 4, which maps to the greedy Dfast (double-fast)
// backend. "Greedy" here names the parse discipline (no lazy lookahead,
// upstream zstd `ZSTD_dfast`), NOT the Row/Greedy strategy (that is Level 5).
// The Dfast L4 coverage is intentional; the Row backend is exercised by the
// `Level(5)` fixtures elsewhere in this file.
//
// Feeds `data` through a driver built from `slice_size` / `max_slices`,
// replays every emitted sequence, asserts the replay equals `data`, and
// returns `(number of triples, largest match offset seen)`.
#[cfg(test)]
fn l4_greedy_round_trip(slice_size: usize, max_slices: usize, data: &[u8]) -> (usize, usize) {
    let mut driver = MatchGeneratorDriver::new(slice_size, max_slices);
    driver.reset(CompressionLevel::Level(4));

    let mut decoded: Vec<u8> = Vec::with_capacity(data.len());
    let mut matches_seen = 0usize;
    let mut farthest_offset = 0usize;

    // `start_matching` consumes the current pending slice, so a multi-slice
    // payload is committed and driven one slice at a time; that way earlier
    // slices are replayed before they get displaced from the window.
    let mut pending = data;
    while !pending.is_empty() {
        let mut space = driver.get_next_space();
        let take = pending.len().min(space.len());
        let (chunk, rest) = pending.split_at(take);
        space[..take].copy_from_slice(chunk);
        space.truncate(take);
        driver.commit_space(space);
        pending = rest;

        driver.start_matching(|seq| match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                matches_seen += 1;
                farthest_offset = farthest_offset.max(offset);
                decoded.extend_from_slice(literals);
                // Byte-at-a-time copy so overlapping matches replay correctly.
                let from = decoded.len() - offset;
                for src in from..from + match_len {
                    let byte = decoded[src];
                    decoded.push(byte);
                }
            }
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
        });
    }

    // An empty payload never enters the loop above, so give it one explicit
    // commit/drive round; this exercises the empty-input path of
    // `start_matching_greedy` (the `current_len == 0` early-return guard).
    if data.is_empty() {
        let mut space = driver.get_next_space();
        space.truncate(0);
        driver.commit_space(space);
        driver.start_matching(|seq| match seq {
            Sequence::Triple { .. } => panic!("empty input must not emit any matches"),
            Sequence::Literals { literals } => decoded.extend_from_slice(literals),
        });
    }

    assert_eq!(decoded, data, "L4 greedy round-trip diverged");
    (matches_seen, farthest_offset)
}
8326
/// Tail rep-only case flagged by CodeRabbit: the old outer-loop guard
/// `pos + ROW_MIN_MATCH_LEN <= current_len` (6) left the last 5-byte
/// position unreachable, although the rep probe at `abs_pos + 1` only
/// needs 4 bytes of lookahead past the probe point. The guard was relaxed
/// to `pos + GREEDY_MIN_LOOKAHEAD <= current_len` (5).
///
/// The two slices are driven separately and only triples emitted **during
/// the second slice's parse pass** are counted, so re-tightening the guard
/// or breaking the cross-slice repcode lookup fails the test instead of
/// being hidden behind first-slice matches.
#[test]
fn driver_level5_greedy_tail_rep_only_reachable() {
    // The period-4 first slice locks rep1 = 4 into `offset_hist` by the time
    // the parse reaches its tail. The second slice is exactly 5 bytes
    // (= `GREEDY_MIN_LOOKAHEAD`), so the outer loop runs **once** at
    // `pos = 0`. The regular `row_candidate` needs 6 bytes from `abs_pos`,
    // which reaches past the live history, leaving the `abs_pos + 1` rep
    // probe as the only viable hit. That probe finds a 4-byte match at
    // offset 4 (`second[1..5] == first[13..16] ++ second[0] == "BCDA"`), and
    // `extend_backwards_shared` then absorbs `second[0]` into the match (one
    // byte back into the implicit anchor, no further because the anchor is
    // the current `abs_pos`).
    let first: &[u8] = b"ABCDABCDABCDABCD"; // 16 bytes — strict period 4
    let second: &[u8] = b"ABCDA"; // 5 bytes — exact GREEDY_MIN_LOOKAHEAD

    let mut driver = MatchGeneratorDriver::new(16, 2);
    driver.reset(CompressionLevel::Level(5));

    let mut second_slice_triples = 0usize;
    for (pass, bytes) in [first, second].into_iter().enumerate() {
        let mut space = driver.get_next_space();
        space[..bytes.len()].copy_from_slice(bytes);
        space.truncate(bytes.len());
        driver.commit_space(space);

        // Sequences from the first pass are ignored on purpose; only the
        // second pass contributes to the counter.
        let is_tail_pass = pass == 1;
        driver.start_matching(|seq| {
            if is_tail_pass {
                if let Sequence::Triple { .. } = seq {
                    second_slice_triples += 1;
                }
            }
        });
    }

    assert!(
        second_slice_triples >= 1,
        "tail rep-only position must produce a match in the second slice \
         (got {second_slice_triples} triples)",
    );
}
8380
#[test]
fn driver_level4_greedy_empty_input_emits_nothing() {
    // Empty input: a single zero-length slice is committed and the matcher
    // must emit no sequence at all and must not panic. This exercises the
    // `current_len == 0` early-return guard at the top of
    // `start_matching_greedy`.
    let mut driver = MatchGeneratorDriver::new(64, 2);
    driver.reset(CompressionLevel::Level(4));

    // The empty space is committed so the matcher has SOMETHING to start
    // matching on; without it `start_matching` panics on the `window.back()`
    // unwrap, which is a separate path covered by existing reset tests.
    let mut empty_space = driver.get_next_space();
    empty_space.truncate(0);
    driver.commit_space(empty_space);

    let mut sequences_seen = 0usize;
    driver.start_matching(|_| sequences_seen += 1);
    assert!(
        sequences_seen == 0,
        "empty slice must not emit any sequences",
    );
}
8399
#[test]
fn driver_level4_greedy_sub_min_lookahead_input() {
    // Four bytes is below `GREEDY_MIN_LOOKAHEAD = 5`, so the outer loop body
    // never runs; the tail literal path still has to emit the input bytes as
    // a single `Sequence::Literals`. The helper asserts the round-trip.
    let short_input: &[u8] = b"abcd";
    let (triples, _max_offset) = l4_greedy_round_trip(64, 2, short_input);
    assert_eq!(
        triples, 0,
        "sub-min-lookahead input must not emit any matches (got {triples})",
    );
}
8412
#[test]
fn driver_level4_greedy_incompressible_input() {
    // 256 pseudo-random bytes with no exploitable structure: every position
    // is a "miss" in both the rep probe and the row candidate. This runs the
    // miss branch and the `SKIP_STRENGTH = 10` skip-step grow (irrelevant at
    // this size, but the path still executes).
    let mut lcg: u32 = 0xDEAD_BEEF;
    let noise: alloc::vec::Vec<u8> = (0..256)
        .map(|_| {
            lcg = lcg.wrapping_mul(1_103_515_245).wrapping_add(12345);
            (lcg >> 16) as u8
        })
        .collect();

    // No structural assertion: passing means the helper's bit-exact
    // round-trip held and no panic / debug_assert fired.
    let _ = l4_greedy_round_trip(64, 8, &noise);
}
8429
#[test]
fn driver_level4_greedy_long_literal_run_skip_step_growth() {
    // Stress smoke test. 2 KiB of unstructured bytes pushes the literal run
    // past the `SKIP_STRENGTH = 10` threshold (~1 KiB), so the miss branch
    // and the per-miss step-grow path in `start_matching_greedy` both run.
    //
    // Only a bit-exact round-trip with no panic / `debug_assert!` is
    // checked. The `SKIP_STRENGTH` constant and the per-iteration step count
    // are deliberately NOT pinned: a round-trip would pass just as well with
    // `SKIP_STRENGTH = 6` or `= 14`, since both yield valid sequences.
    // Pinning the exact growth would mean returning step / iteration
    // metadata from the parse, which is invasive plumbing for a constant
    // that hasn't been re-tuned in months. The value here is catching
    // panics or correctness regressions on long incompressible runs.
    let mut state: u32 = 0xC0FF_EE00;
    let noise: alloc::vec::Vec<u8> = (0..2048)
        .map(|_| {
            state = state.wrapping_mul(0x9E37_79B9).wrapping_add(0xCAFEBABE);
            (state >> 24) as u8
        })
        .collect();
    let _ = l4_greedy_round_trip(512, 8, &noise);
}
8454
#[test]
fn driver_level4_greedy_all_zeros_heavy_rep1() {
    // With all-zero input every position after the first byte satisfies
    // `byte[pos] == byte[pos - 1]`, so the rep1 probe at `abs_pos + 1` hits
    // immediately and the parse collapses to a single long match. This is
    // the `cheap rep at +1, full-match length` path.
    let zeros = [0u8; 128];
    let (triples, max_offset) = l4_greedy_round_trip(64, 8, &zeros);

    assert!(
        triples >= 1,
        "all-zeros input must produce at least one rep1 match",
    );
    // Every byte equals its predecessor, so the committed offset should be
    // rep1 (offset 1); anything larger would mean the rep1 probe was
    // bypassed.
    assert_eq!(
        max_offset, 1,
        "all-zeros L4 greedy parse should commit at offset 1 (got {max_offset})",
    );
}
8475
/// Covers the steady-state rep-cascade path of the greedy parse with a
/// periodic payload: once the period is locked into `offset_hist[0]`, the
/// main-loop rep probe at `abs_pos + 1` fires on every iteration and the
/// parse emits a long chain of triples at the same offset.
#[test]
fn driver_level4_greedy_periodic_pattern_rep_cascade() {
    let unit: &[u8] = b"alpha_beta_gamma";
    assert_eq!(unit.len(), 16);

    // 32 back-to-back copies of the 16-byte unit.
    let data: Vec<u8> = (0..32).flat_map(|_| unit.iter().copied()).collect();

    let (triples, max_offset) = l4_greedy_round_trip(64, 16, &data);
    assert!(
        triples >= 1,
        "periodic 16-byte payload must emit matches (got {triples})",
    );
    assert!(
        max_offset >= 16,
        "periodic 16-byte payload must produce at least one offset >= 16 \
         (got max_offset = {max_offset})",
    );
}
8500
#[test]
fn driver_reset_keeps_strategy_tag_in_sync_with_active_backend() {
    use super::strategy::StrategyTag;

    // (level handed to `reset`, strategy tag the driver must report)
    let expectations = [
        (CompressionLevel::Level(1), StrategyTag::Fast),
        (CompressionLevel::Level(2), StrategyTag::Fast),
        (CompressionLevel::Level(3), StrategyTag::Dfast),
        (CompressionLevel::Level(4), StrategyTag::Dfast),
        (CompressionLevel::Level(5), StrategyTag::Greedy),
        (CompressionLevel::Level(7), StrategyTag::Lazy),
        (CompressionLevel::Level(12), StrategyTag::Lazy),
        (CompressionLevel::Level(13), StrategyTag::Btlazy2),
        (CompressionLevel::Level(14), StrategyTag::Btlazy2),
        (CompressionLevel::Level(15), StrategyTag::Btlazy2),
        (CompressionLevel::Level(16), StrategyTag::BtOpt),
        (CompressionLevel::Level(18), StrategyTag::BtUltra),
        (CompressionLevel::Level(22), StrategyTag::BtUltra2),
        (CompressionLevel::Fastest, StrategyTag::Fast),
        (CompressionLevel::Default, StrategyTag::Dfast),
        (CompressionLevel::Better, StrategyTag::Lazy),
        // `Best` sits on level 13 (the first dominant point of the deep band).
        (CompressionLevel::Best, StrategyTag::Btlazy2),
    ];

    for (level, expected) in expectations {
        // A fresh driver per row, so every level is checked from a clean
        // state.
        let mut driver = MatchGeneratorDriver::new(32, 2);
        driver.reset(level);
        assert_eq!(
            driver.strategy_tag, expected,
            "strategy_tag wrong for {level:?}"
        );
        // The tag's declared backend has to match the backend that is
        // actually active after the reset.
        assert_eq!(
            driver.strategy_tag.backend(),
            driver.active_backend(),
            "strategy_tag backend disagrees with active_backend for {level:?}"
        );
    }
}
8538
#[test]
fn level_16_17_map_to_btopt_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    // Resolve both levels up front, then check each reports the HashChain
    // backend tag.
    let resolved =
        [16, 17].map(|level| resolve_level_params(CompressionLevel::Level(level), None));
    for params in resolved {
        assert_eq!(params.backend(), BackendTag::HashChain);
    }

    // Both levels carry the BtOpt strategy tag.
    for level in [16, 17] {
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtOpt);
    }
}
8549
#[test]
fn level_18_maps_to_btultra_level_19_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    // Per upstream zstd `clevels.h` (srcSize > 256 KiB tier), level 18 is
    // `ZSTD_btultra` and level 19 is `ZSTD_btultra2`. Level 19 used to be
    // mapped to plain btultra, which under-searched (searchLog 6 vs 7) and
    // lost ~3.7% ratio on the repo corpus.
    let resolved =
        [18, 19].map(|level| resolve_level_params(CompressionLevel::Level(level), None));
    for params in resolved {
        assert_eq!(params.backend(), BackendTag::HashChain);
    }

    assert_eq!(StrategyTag::for_level(18), StrategyTag::BtUltra);
    assert_eq!(StrategyTag::for_level(19), StrategyTag::BtUltra2);
}
8564
#[test]
fn level_20_22_map_to_btultra2_strategy() {
    use super::strategy::{BackendTag, StrategyTag};

    // Levels 20, 21 and 22 must all report the HashChain backend tag and
    // the BtUltra2 strategy tag.
    for level in 20u8..=22 {
        let resolved = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(resolved.backend(), BackendTag::HashChain);
        assert_eq!(StrategyTag::for_level(level), StrategyTag::BtUltra2);
    }
}
8574
#[test]
fn level22_uses_target_length_and_large_input_tables() {
    // Level 22 resolved without a source-size hint: pin the window log and
    // the hash-chain geometry it selects.
    let level22 = resolve_level_params(CompressionLevel::Level(22), None);
    assert_eq!(level22.window_log, 27);

    let chain_cfg = level22.hc.unwrap();
    assert_eq!(chain_cfg.hash_log, 25);
    assert_eq!(chain_cfg.chain_log, 27);
    assert_eq!(chain_cfg.search_depth, 1 << 9);
    assert_eq!(chain_cfg.target_len, 999);
}
8585
#[test]
fn bt_levels_16_to_21_pin_clevels_params() {
    // Freezes the (window_log, hash_log, chain_log, search_depth,
    // target_len) tuple of every BT level so the clevels.h alignment cannot
    // drift unnoticed. Levels 16-20 mirror upstream `clevels.h` (srcSize >
    // 256 KiB tier, search_depth = 1 << searchLog). Level 21 intentionally
    // keeps a deeper search_depth (512 vs upstream's 128): it beats C on
    // ratio there, and the deeper walk is a deliberate ratio-positive
    // divergence.
    let pinned = [
        // (level, window_log, hash_log, chain_log, search_depth, target_len)
        (16u8, 22u8, 22usize, 22usize, 32usize, 48usize),
        (17, 23, 22, 23, 32, 64),
        (18, 23, 22, 23, 64, 64),
        (19, 23, 22, 24, 128, 256),
        (20, 25, 23, 25, 128, 256),
        (21, 26, 24, 24, 512, 256),
    ];

    for (level, window_log, hash_log, chain_log, search_depth, target_len) in pinned {
        let resolved = resolve_level_params(CompressionLevel::Level(i32::from(level)), None);
        assert_eq!(resolved.window_log, window_log, "level {level} window_log");

        let chain_cfg = resolved.hc.unwrap();
        assert_eq!(chain_cfg.hash_log, hash_log, "level {level} hash_log");
        assert_eq!(chain_cfg.chain_log, chain_log, "level {level} chain_log");
        assert_eq!(
            chain_cfg.search_depth, search_depth,
            "level {level} search_depth"
        );
        assert_eq!(chain_cfg.target_len, target_len, "level {level} target_len");
    }
}
8613
#[test]
fn level22_source_size_hint_uses_btultra2_tiers() {
    // Level 22 resolved with a source-size hint: each hint below must yield
    // the listed geometry. `target_len` is 999 for every tier.
    let tiers = [
        // (source size hint, window_log, hash_log, chain_log, search_depth)
        (16 * 1024u64, 14u8, 15usize, 15usize, 1usize << 10),
        (128 * 1024, 17, 17, 18, 1 << 11),
        (256 * 1024, 18, 19, 19, 1 << 13),
    ];

    for (source_size, window_log, hash_log, chain_log, search_depth) in tiers {
        let resolved = resolve_level_params(CompressionLevel::Level(22), Some(source_size));
        assert_eq!(resolved.window_log, window_log);

        let chain_cfg = resolved.hc.unwrap();
        assert_eq!(chain_cfg.hash_log, hash_log);
        assert_eq!(chain_cfg.chain_log, chain_log);
        assert_eq!(chain_cfg.search_depth, search_depth);
        assert_eq!(chain_cfg.target_len, 999);
    }
}
8640
#[test]
fn level22_non_power_of_two_small_source_uses_tier3_params() {
    // A 15 027-byte source (<= 16 KB) picks the table[3] btultra2 row, and
    // the source-size clamp yields windowLog 14 (ceil log2 15027). This is a
    // pure-Rust assertion against the constant tier-3 geometry (no FFI).
    let hinted_size = 15_027u64;
    let resolved = resolve_level_params(CompressionLevel::Level(22), Some(hinted_size));

    let chain_cfg = resolved.hc.unwrap();
    assert_eq!(resolved.window_log, 14);
    assert_eq!(chain_cfg.chain_log, 15);
    assert_eq!(chain_cfg.hash_log, 15);
    assert_eq!(chain_cfg.search_depth, 1 << 10);
    assert_eq!(HC_OPT_MIN_MATCH_LEN, 3);
    assert_eq!(chain_cfg.target_len, 999);
}
8657
#[test]
fn level22_small_source_uses_window_bounded_hash3_log() {
    use super::strategy::StrategyTag;

    let mut matcher = HcMatchGenerator::new(1 << 14);

    // The 16K config with window log 14 must leave hash3_log at 14.
    matcher.configure(BTULTRA2_HC_CONFIG_L22_16K, StrategyTag::BtUltra2, 14);
    assert_eq!(matcher.table.hash3_log, 14);

    // Reconfiguring the same matcher with the full L22 config and window
    // log 27 must leave hash3_log at `HC3_HASH_LOG`.
    matcher.configure(BTULTRA2_HC_CONFIG_L22, StrategyTag::BtUltra2, 27);
    assert_eq!(matcher.table.hash3_log, HC3_HASH_LOG);
}
8675
#[test]
fn btultra2_seed_pass_initializes_opt_state() {
    use super::strategy::StrategyTag;

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // 32 KiB of the byte ramp 0, 1, ..., 250 repeated (i.e. `i % 251`).
    let ramp: Vec<u8> = (0..251u8).cycle().take(32 * 1024).collect();
    matcher.table.add_data(ramp, |_| {});
    matcher.start_matching(|_| {});

    // After the first block both the literal-length and the offset-code
    // statistics must be non-zero.
    let lit_length_sum = matcher.backend.bt_mut().opt_state.lit_length_sum;
    assert!(
        lit_length_sum > 0,
        "btultra2 first block should seed non-zero sequence statistics"
    );
    let off_code_sum = matcher.backend.bt_mut().opt_state.off_code_sum;
    assert!(
        off_code_sum > 0,
        "btultra2 first block should seed offset-code statistics"
    );
}
8696
#[test]
fn btultra2_profile_disables_small_offset_handicap() {
    use super::strategy::BtUltra2;

    // Before Phase 3 this test built the profile twice (`pass2=false` and
    // `pass2=true`) because `for_mode` told the two apart. With
    // `const_for_strategy::<BtUltra2>()` only one profile exists (the
    // upstream zstd `opt2` pricing), so one binding covers the invariant.
    let opt2_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();

    let favors_small = opt2_profile.favor_small_offsets;
    assert!(
        !favors_small,
        "btultra2 should match upstream zstd opt2 offset pricing"
    );
    let is_accurate = opt2_profile.accurate;
    assert!(
        is_accurate,
        "btultra2 should use upstream zstd opt2 accurate pricing"
    );
}
8714
#[test]
fn btultra_profile_keeps_search_depth_budget() {
    use super::strategy::BtUltra;

    // Pin the BtUltra profile's chain-depth budget at 64.
    let ultra_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra>();
    assert_eq!(
        ultra_profile.max_chain_depth, 64,
        "btultra chain-depth budget must match clevels.h level 18 searchLog 6 (1 << 6 = 64)"
    );
}
8723
#[test]
fn btopt_profile_keeps_search_depth_budget() {
    use super::strategy::BtOpt;

    // Pin the BtOpt profile's chain-depth budget at 32.
    let opt_profile = HcOptimalCostProfile::const_for_strategy::<BtOpt>();
    assert_eq!(
        opt_profile.max_chain_depth, 32,
        "btopt should not cap chain depth below upstream zstd btopt search budget"
    );
}
8732
#[test]
fn sufficient_match_len_is_clamped_by_target_len() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Overwrite the configured target length with a small value; the
    // sufficient length reported for the pass must equal it.
    matcher.hc.target_len = 13;
    let opt2_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    assert_eq!(matcher.hc.sufficient_match_len_for_pass(opt2_profile), 13);
}
8745
#[test]
fn opt_modes_use_target_len_as_sufficient_len() {
    use super::strategy::{BtOpt, BtUltra, BtUltra2};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = 57;

    // All three optimal-parse profiles must report the target length as the
    // sufficient match length.
    let opt_profiles = [
        HcOptimalCostProfile::const_for_strategy::<BtOpt>(),
        HcOptimalCostProfile::const_for_strategy::<BtUltra>(),
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    ];
    for opt_profile in opt_profiles {
        let sufficient = matcher.hc.sufficient_match_len_for_pass(opt_profile);
        assert_eq!(sufficient, 57);
    }
}
8760
#[test]
fn sufficient_match_len_is_capped_by_opt_num() {
    use super::strategy::BtUltra2;

    // A very large target length must be capped at `HC_OPT_NUM - 1`.
    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.hc.target_len = usize::MAX / 2;

    let opt2_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let sufficient = matcher.hc.sufficient_match_len_for_pass(opt2_profile);
    assert_eq!(sufficient, HC_OPT_NUM - 1);
}
8768
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_entropy_seed_initializes_opt_state_from_tables() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Seed with a Huffman table built from sample bytes plus the default
    // LL / ML / OF FSE tables.
    let huffman = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(
        b"aaabbbbccccddddeeeeefffffgggg",
    );
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        Some(&huffman),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    matcher.backend.bt_mut().opt_state.rescale_freqs(
        b"abcd",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );

    // Fallback LL bootstrap frequencies: 4, 2, then 1 for every other code.
    let mut fallback_ll = [1u32; HC_MAX_LL + 1];
    fallback_ll[0] = 4;
    fallback_ll[1] = 2;

    assert_ne!(
        matcher.backend.bt_mut().opt_state.lit_length_freq,
        fallback_ll,
        "dictionary entropy should override fallback LL bootstrap frequencies"
    );

    let ml_is_uniform = matcher
        .backend
        .bt_mut()
        .opt_state
        .match_length_freq
        .iter()
        .all(|&freq| freq == 1);
    assert!(
        !ml_is_uniform,
        "dictionary entropy should seed non-uniform ML frequencies"
    );

    assert_ne!(
        matcher.backend.bt_mut().opt_state.off_code_freq[0],
        6,
        "dictionary entropy should override fallback OF bootstrap frequencies"
    );
}
8817
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_fse_seed_applies_without_huffman_seed() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Seed with the default LL / ML / OF FSE tables only; the Huffman slot
    // is deliberately `None`.
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );
    matcher.backend.bt_mut().opt_state.rescale_freqs(
        b"abcd",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );

    // Fallback LL bootstrap frequencies: 4, 2, then 1 for every other code.
    let mut fallback_ll = [1u32; HC_MAX_LL + 1];
    fallback_ll[0] = 4;
    fallback_ll[1] = 2;

    assert_ne!(
        matcher.backend.bt_mut().opt_state.lit_length_freq,
        fallback_ll,
        "FSE seed should still override LL bootstrap frequencies without huffman seed"
    );

    let ml_is_uniform = matcher
        .backend
        .bt_mut()
        .opt_state
        .match_length_freq
        .iter()
        .all(|&freq| freq == 1);
    assert!(
        !ml_is_uniform,
        "FSE seed should still seed non-uniform ML frequencies"
    );

    assert_ne!(
        matcher.backend.bt_mut().opt_state.off_code_freq[0],
        6,
        "FSE seed should still override OF bootstrap frequencies without huffman seed"
    );
}
8861
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_seed_overrides_predef_price_mode_on_tiny_input() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut matcher = HcMatchGenerator::new(1 << 20);
    matcher.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // FSE-only dictionary seed (no Huffman table).
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    matcher.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    // Rescale against a 3-byte source; the price mode must still be Dynamic
    // afterwards.
    matcher.backend.bt_mut().opt_state.rescale_freqs(
        b"abc",
        HcOptimalCostProfile::const_for_strategy::<BtUltra2>(),
    );
    let stays_dynamic = matches!(
        matcher.backend.bt_mut().opt_state.price_type,
        HcOptPriceType::Dynamic
    );
    assert!(
        stays_dynamic,
        "dictionary-seeded first block should stay in dynamic mode even for tiny src"
    );
}
8888
#[test]
fn lit_length_price_blocksize_max_costs_one_extra_bit() {
    use super::strategy::BtUltra2;

    // In both pricing modes the literal-length price at `HC_BLOCKSIZE_MAX`
    // must be exactly `HC_BITCOST_MULTIPLIER` (one bit) above the price one
    // step below it.

    // --- Predefined pricing ---
    let predef_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let mut predef_stats = HcOptState::new();
    predef_stats.price_type = HcOptPriceType::Predefined;
    let predef_at_max = predef_profile.lit_length_price(&predef_stats, HC_BLOCKSIZE_MAX);
    let predef_below_max =
        predef_profile.lit_length_price(&predef_stats, HC_BLOCKSIZE_MAX.saturating_sub(1));
    assert_eq!(
        predef_at_max,
        predef_below_max + HC_BITCOST_MULTIPLIER,
        "predefined litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
    );

    // --- Dynamic pricing ---
    // Every frequency table is filled with 1 and its sum set to the table
    // length, then the base prices are computed from those tables.
    let dynamic_profile = HcOptimalCostProfile::const_for_strategy::<BtUltra2>();
    let mut dynamic_stats = HcOptState::new();
    dynamic_stats.price_type = HcOptPriceType::Dynamic;
    dynamic_stats.lit_length_freq.fill(1);
    dynamic_stats.lit_length_sum = (HC_MAX_LL + 1) as u32;
    dynamic_stats.match_length_freq.fill(1);
    dynamic_stats.match_length_sum = (HC_MAX_ML + 1) as u32;
    dynamic_stats.off_code_freq.fill(1);
    dynamic_stats.off_code_sum = (HC_MAX_OFF + 1) as u32;
    dynamic_stats.lit_freq.fill(1);
    dynamic_stats.lit_sum = (HC_MAX_LIT + 1) as u32;
    dynamic_stats.set_base_prices(true);
    let dynamic_at_max = dynamic_profile.lit_length_price(&dynamic_stats, HC_BLOCKSIZE_MAX);
    let dynamic_below_max =
        dynamic_profile.lit_length_price(&dynamic_stats, HC_BLOCKSIZE_MAX.saturating_sub(1));
    assert_eq!(
        dynamic_at_max,
        dynamic_below_max + HC_BITCOST_MULTIPLIER,
        "dynamic litLength pricing at BLOCKSIZE_MAX must add exactly one bit"
    );
}
8923
/// After seeding dictionary entropy (no Huffman table, default LL/ML/OF FSE
/// tables), the BtUltra2 seed pass must be reported as disabled.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn btultra2_seed_pass_disabled_when_dictionary_entropy_seed_present() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    hc.seed_dictionary_entropy(
        None,
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let block_len = HC_PREDEF_THRESHOLD + 1;
    let runs_seed_pass = hc.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "dictionary-seeded first block should skip btultra2 warmup pass"
    );
}
8942
/// With a non-zero absolute history start and one chunk already stored in the
/// table, the BtUltra2 seed pass must be reported as disabled.
#[test]
fn btultra2_seed_pass_disabled_when_prefix_history_exists() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Fake earlier input: shift the absolute start and store a 16-byte chunk.
    hc.table.history_abs_start = 17;
    let prefix_chunk = b"abcdefghijklmnop".to_vec();
    hc.table.push_test_chunk(prefix_chunk);

    let block_len = HC_PREDEF_THRESHOLD + 9;
    let runs_seed_pass = hc.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must be first-block only (no prefix history)"
    );
}
8958
/// A block of exactly `HC_PREDEF_THRESHOLD` bytes on a freshly configured
/// generator must not trigger the BtUltra2 seed pass.
#[test]
fn btultra2_seed_pass_disabled_for_tiny_block() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let runs_seed_pass = hc.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup should not run at or below predefined threshold"
    );
}
8972
/// Once the optimal-parser state carries a non-zero `lit_length_sum`, the
/// BtUltra2 seed pass must be reported as disabled.
#[test]
fn btultra2_seed_pass_disabled_after_stats_initialized() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // A non-zero literal-length sum stands in for already-populated stats.
    hc.backend.bt_mut().opt_state.lit_length_sum = 1;

    let block_len = HC_PREDEF_THRESHOLD + 32;
    let runs_seed_pass = hc.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup should run only for first block before stats are initialized"
    );
}
8987
/// When the recorded window size exceeds the single live chunk's length, the
/// BtUltra2 seed pass must be reported as disabled.
#[test]
fn btultra2_seed_pass_disabled_when_not_at_frame_start() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    let block_len = HC_PREDEF_THRESHOLD + 32;
    // Simulate a non-first block: the current block has no prefix in the
    // deque, yet the total produced window already includes prior output.
    hc.table.window_size = HC_PREDEF_THRESHOLD + 64;
    // The window size above was set by hand; the current block is recorded as
    // one live chunk (the seed-pass check reads lengths, not bytes).
    hc.table.chunk_lens.push_back(block_len);

    let runs_seed_pass = hc.should_run_btultra2_seed_pass::<BtUltra2>(block_len);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must not run after frame start"
    );
}
9007
/// With one raw LDM sequence already queued on the BT backend, the BtUltra2
/// seed pass must be reported as disabled.
#[test]
fn btultra2_seed_pass_disabled_when_ldm_sequences_exist() {
    use super::strategy::{BtUltra2, StrategyTag};

    let mut hc = HcMatchGenerator::new(1 << 20);
    hc.configure(BTULTRA2_HC_CONFIG, StrategyTag::BtUltra2, 26);

    // Window size and the single live chunk agree on the same length here.
    let window_len = HC_PREDEF_THRESHOLD + 64;
    hc.table.window_size = window_len;
    hc.table.chunk_lens.push_back(window_len);

    let queued_seq = HcRawSeq {
        lit_length: 8,
        offset: 16,
        match_length: 32,
    };
    hc.backend.bt_mut().ldm_sequences.push(queued_seq);

    let runs_seed_pass =
        hc.should_run_btultra2_seed_pass::<BtUltra2>(HC_PREDEF_THRESHOLD + 32);
    assert!(
        !runs_seed_pass,
        "btultra2 warmup must not run when LDM already produced sequences"
    );
}
9028
/// With literal compression switched off in the state, a literal must be
/// priced at `8 * HC_BITCOST_MULTIPLIER` even under predefined pricing.
#[test]
fn literal_price_uses_eight_bits_when_literals_uncompressed() {
    let mut state = HcOptState::new();
    state.set_literals_compressed_for_tests(false);
    state.price_type = HcOptPriceType::Predefined;

    let cost_profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::BtUltra2>();
    let price = cost_profile.literal_price(&state, b'a');
    assert_eq!(
        price,
        8 * HC_BITCOST_MULTIPLIER,
        "uncompressed literals should cost 8 bits regardless of price mode"
    );
}
9041
/// With literal compression switched off, one `update_stats` call must leave
/// the literal sum and literal frequencies at zero while the literal-length,
/// match-length and offset-code sums each become 1.
#[test]
fn update_stats_skips_literal_frequencies_when_uncompressed() {
    let mut state = HcOptState::new();
    state.set_literals_compressed_for_tests(false);
    state.update_stats(3, b"abc", 4, 8);

    // Literal statistics: untouched.
    assert_eq!(
        state.lit_sum, 0,
        "literal sum must remain unchanged when literal compression is disabled"
    );
    let lit_freq_total: u32 = state.lit_freq.iter().sum();
    assert_eq!(
        lit_freq_total, 0,
        "literal frequencies must not be updated when literal compression is disabled"
    );

    // Sequence statistics: each sum is checked to be exactly 1.
    assert_eq!(
        state.lit_length_sum, 1,
        "literal-length stats still update for sequence modeling"
    );
    assert_eq!(
        state.match_length_sum, 1,
        "match-length stats still update for sequence modeling"
    );
    assert_eq!(
        state.off_code_sum, 1,
        "offset-code stats still update for sequence modeling"
    );
}
9069
/// With literal compression switched off, seeding a dictionary Huffman table
/// and then rescaling frequencies must leave the literal sum and the literal
/// frequency table at zero.
#[test]
#[allow(clippy::borrow_deref_ref)]
fn dictionary_huffman_seed_ignored_when_literals_uncompressed() {
    let mut state = HcOptState::new();
    state.set_literals_compressed_for_tests(false);

    // Dictionary entropy: a Huffman table built from sample bytes plus the
    // default LL/ML/OF FSE tables.
    let sample = b"aaaaabbbbcccddeeff00112233445566778899";
    let huff_table = crate::huff0::huff0_encoder::HuffmanTable::build_from_data(sample);
    let ll_table = crate::fse::fse_encoder::default_ll_table();
    let ml_table = crate::fse::fse_encoder::default_ml_table();
    let of_table = crate::fse::fse_encoder::default_of_table();
    state.seed_dictionary_entropy(
        Some(&huff_table),
        Some(&*ll_table),
        Some(&*ml_table),
        Some(&*of_table),
    );

    let cost_profile = HcOptimalCostProfile::const_for_strategy::<super::strategy::BtUltra2>();
    state.rescale_freqs(b"abcd", cost_profile);

    assert_eq!(
        state.lit_sum, 0,
        "literal sum must stay zero when literals are uncompressed"
    );
    let lit_freq_total: u32 = state.lit_freq.iter().sum();
    assert_eq!(
        lit_freq_total, 0,
        "literal frequencies must ignore dictionary huffman seed when uncompressed"
    );
}
9096
/// Enumerates repcode candidates at the second "ABCDEF" run for `lit_len = 1`
/// and `lit_len = 0`, checking that offset 6 is reported in the first case
/// and absent in the second.
#[test]
fn hc_repcode_candidates_respect_litlen_dependent_rep_order() {
    let mut hc = HcMatchGenerator::new(64);
    hc.table.history = b"xxxxxxABCDEFABCDEF".to_vec();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;

    // Position 12 is the first byte of the second "ABCDEF".
    let second_run_pos = 12usize;
    let block_end = hc.table.history.len();
    let rep_offsets = [6u32, 3u32, 9u32];

    // One pending literal.
    let mut offsets_with_literal = Vec::new();
    hc.hc.for_each_repcode_candidate_with_reps(
        &hc.table,
        second_run_pos,
        1,
        rep_offsets,
        block_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_with_literal.push(candidate.offset),
    );
    assert!(
        offsets_with_literal.contains(&6),
        "when lit_len>0, rep0 should be considered and match"
    );

    // No pending literals.
    let mut offsets_without_literal = Vec::new();
    hc.hc.for_each_repcode_candidate_with_reps(
        &hc.table,
        second_run_pos,
        0,
        rep_offsets,
        block_end,
        HC_OPT_MIN_MATCH_LEN,
        |candidate| offsets_without_literal.push(candidate.offset),
    );
    assert!(
        !offsets_without_literal.contains(&6),
        "when lit_len==0, rep0 is not directly eligible (ll0 semantics)"
    );
}
9142
/// With the search depth and the profile's chain depth both zero, candidate
/// collection over a period-3 history must still return something, including
/// a candidate at offset 3.
#[test]
fn hc_collect_optimal_candidates_keeps_reps_when_chain_depth_zero() {
    let mut hc = HcMatchGenerator::new(64);
    hc.hc.search_depth = 0;
    hc.table.history = b"xyzxyzxyzxyz".to_vec();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;

    let probe_pos = 6usize;
    let block_end = hc.table.history.len();
    let no_chain_profile = HcOptimalCostProfile {
        accurate: false,
        favor_small_offsets: false,
        max_chain_depth: 0,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [3, 6, 9],
        lit_len: 1,
        ldm_candidate: None,
    };

    let mut candidates = Vec::new();
    hc.collect_optimal_candidates(probe_pos, block_end, no_chain_profile, query, &mut candidates);

    assert!(
        !candidates.is_empty(),
        "rep candidates should remain available even when chain depth is zero"
    );
    let has_rep0 = candidates.iter().any(|candidate| candidate.offset == 3);
    assert!(has_rep0, "rep0 candidate should be retained");
}
9180
/// On an all-`a` history with positions before the probe point inserted,
/// every collected candidate must carry offset 1 or 4 (two of the three
/// supplied repcode offsets).
#[test]
fn hc_collect_optimal_candidates_rep_tail_match_skips_chain_probe() {
    let mut hc = HcMatchGenerator::new(64);
    hc.table.history = b"aaaaaaaaaa".to_vec();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;
    hc.table.position_base = 0;
    hc.hc.search_depth = 32;

    // Allocate the tables, then index every position before the probe point.
    let probe_pos = 6usize;
    hc.table.ensure_tables();
    hc.table.insert_positions(0, probe_pos);

    let deep_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };
    let block_end = hc.table.history.len();

    let mut candidates = Vec::new();
    hc.collect_optimal_candidates(probe_pos, block_end, deep_profile, query, &mut candidates);

    let only_rep_offsets = candidates
        .iter()
        .all(|candidate| matches!(candidate.offset, 1 | 4));
    assert!(
        only_rep_offsets,
        "terminal rep match should return before chain probing adds non-rep offsets"
    );
}
9218
/// After collecting candidates on a period-3 history with a zeroed skip
/// marker, `skip_insert_until_abs` must end up beyond the probe position.
#[test]
fn hc_collect_optimal_candidates_long_chain_match_advances_skip_window() {
    let mut hc = HcMatchGenerator::new(128);
    hc.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;
    hc.table.position_base = 0;
    hc.hc.search_depth = 32;

    // Allocate the tables, index everything before the probe point, and
    // reset the skip marker so any advance is attributable to the collector.
    let probe_pos = 9usize;
    hc.table.ensure_tables();
    hc.table.insert_positions(0, probe_pos);
    hc.table.skip_insert_until_abs = 0;

    let deep_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };
    let block_end = hc.table.history.len();

    let mut candidates = Vec::new();
    hc.collect_optimal_candidates(probe_pos, block_end, deep_profile, query, &mut candidates);

    let skip_marker = hc.table.skip_insert_until_abs;
    assert!(
        skip_marker > probe_pos,
        "long chain match should advance skip window to avoid redundant immediate insertions"
    );
}
9256
/// With `sufficient_match_len = 10` on a period-3 history, the skip marker
/// must move past the probe position but stay at or below the furthest
/// candidate end minus 8.
#[test]
fn hc_collect_optimal_candidates_chain_fast_skip_uses_match_end_minus_8() {
    let mut hc = HcMatchGenerator::new(128);
    hc.table.history = b"abcabcabcabcabcabcabcabc".to_vec();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;
    hc.table.position_base = 0;
    hc.hc.search_depth = 32;

    // Allocate the tables, index everything before the probe point, and
    // reset the skip marker.
    let probe_pos = 9usize;
    hc.table.ensure_tables();
    hc.table.insert_positions(0, probe_pos);
    hc.table.skip_insert_until_abs = 0;

    let early_exit_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 32,
        sufficient_match_len: 10,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };
    let block_end = hc.table.history.len();

    let mut candidates = Vec::new();
    hc.collect_optimal_candidates(
        probe_pos,
        block_end,
        early_exit_profile,
        query,
        &mut candidates,
    );

    // Furthest end position over all returned candidates.
    let furthest_match_end = candidates
        .iter()
        .map(|candidate| candidate.start.saturating_add(candidate.match_len))
        .max()
        .expect("expected at least one candidate");
    let skip_marker = hc.table.skip_insert_until_abs;
    assert!(
        skip_marker > probe_pos,
        "chain fast-skip must advance past current position"
    );
    assert!(
        skip_marker <= furthest_match_end.saturating_sub(8),
        "chain fast-skip must not exceed upstream zstd-style matchEndIdx - 8 bound"
    );
}
9303
/// On a history of 16 distinct bytes with zero search depth and no inserted
/// positions, candidate collection must set the skip marker to exactly one
/// past the probe position.
#[test]
fn hc_collect_optimal_candidates_advances_skip_window_on_plain_bt_path() {
    let mut hc = HcMatchGenerator::new(256);
    hc.table.history = b"abcdefghijklmnop".to_vec();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;
    hc.table.position_base = 0;
    hc.hc.search_depth = 0;
    hc.table.ensure_tables();

    let probe_pos = 8usize;
    hc.table.skip_insert_until_abs = 0;

    let no_chain_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 0,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: None,
    };
    let block_end = hc.table.history.len();

    let mut candidates = Vec::new();
    hc.collect_optimal_candidates(probe_pos, block_end, no_chain_profile, query, &mut candidates);

    let expected_marker = probe_pos.saturating_add(1);
    assert_eq!(
        hc.table.skip_insert_until_abs, expected_marker,
        "plain BT path should advance skip window by 1 via upstream zstd matchEndIdx baseline"
    );
}
9342
9343// Removed: the three `hc_collect_optimal_candidates_*_hash3_*` /
9344// `hc_hash3_tail_match_*` tests forced `search_depth = 0` together
9345// with `hash3_log != 0`, an HC-chain-walker-only fixture state that
9346// production never reaches (hash3 is BtUltra2-only and BtUltra2 always
9347// runs `search_depth = 512`). They depended on the `has_hash3 =>
9348// BtUltra2` escape hatch in the test dispatcher; with that hatch gone
9349// (CR review on PR #123) and the dispatcher routing purely from
9350// `self.strategy_tag`, there is no production-shaped configuration
9351// that reproduces what those tests asserted. The corresponding hash3
9352// invariants are exercised end-to-end by the existing level22 roundtrip
9353// + upstream zstd-parity ratio gate.
9354
/// Supplies an LDM candidate through the query and checks that a candidate
/// with the same offset and match length appears in the collected output.
#[test]
fn hc_ldm_candidates_are_merged_into_optimal_candidates() {
    let mut hc = HcMatchGenerator::new(512);
    hc.table.history = (0..256).map(|i| (i % 251) as u8).collect();
    hc.table.history_abs_start = 0;
    hc.table.history_start = 0;

    let probe_pos = 128usize;
    let block_end = 256usize;
    let ldm_hint = MatchCandidate {
        start: probe_pos,
        offset: 96,
        match_len: 40,
    };

    let no_chain_profile = HcOptimalCostProfile {
        accurate: true,
        favor_small_offsets: false,
        max_chain_depth: 0,
        sufficient_match_len: usize::MAX / 2,
    };
    let query = HcCandidateQuery {
        reps: [1, 4, 8],
        lit_len: 1,
        ldm_candidate: Some(ldm_hint),
    };

    let mut candidates = Vec::new();
    hc.collect_optimal_candidates(probe_pos, block_end, no_chain_profile, query, &mut candidates);

    let ldm_present = candidates.iter().any(|candidate| {
        candidate.offset == ldm_hint.offset && candidate.match_len == ldm_hint.match_len
    });
    assert!(
        ldm_present,
        "LDM candidate should be present in optimal candidate set"
    );
}
9395
/// Checks that both the BtUltra2 and the BtUltra configurations report at
/// least one candidate with `offset >= 32` when the probe position repeats
/// 24 bytes that sit below `dictionary_limit_abs`.
///
/// Each strategy gets its own freshly configured generator. The shared `out`
/// buffer is cleared before the second run so that its assertion cannot be
/// satisfied by candidates left over from the first run.
#[test]
fn btultra_and_btultra2_both_keep_dictionary_candidates() {
    // Routes the BtUltra2 / BtUltra fixture through the production
    // `configure()` path so derived state (`hash3_log`, `is_btultra2`,
    // `uses_bt`, `backend`) stays consistent — manually flipping the
    // strategy flags here used to leave `hash3_log` / `hash3_table` in
    // the previous mode's shape and trip the
    // `Strategy::USE_HASH3 ⇒ hash3_log != 0` debug invariant inside
    // `collect_optimal_candidates_initialized_body`.
    use super::strategy::StrategyTag;

    let test_config = HcConfig {
        hash_log: 23,
        chain_log: 22,
        search_depth: 32,
        target_len: 256,
        search_mls: 4,
    };
    let window_log = 20u8;

    // Builds a 160-byte history: bytes 0..64 cycle with period 7, bytes
    // 64..160 cycle with period 5, and the 24 bytes at `abs_pos` are
    // overwritten with a copy of bytes 16..40. Positions before `abs_pos`
    // are then indexed and the dictionary limit is set to 64.
    let prepare_history = |hc: &mut HcMatchGenerator, abs_pos: usize| {
        hc.table.history = alloc::vec![0u8; 160];
        for i in 0..64 {
            hc.table.history[i] = b'a' + (i % 7) as u8;
        }
        for i in 64..160 {
            hc.table.history[i] = b'k' + (i % 5) as u8;
        }
        for i in 0..24 {
            hc.table.history[abs_pos + i] = hc.table.history[16 + i];
        }
        hc.table.history_start = 0;
        hc.table.history_abs_start = 0;
        hc.table.position_base = 0;
        hc.table.ensure_tables();
        hc.table.insert_positions(0, abs_pos);
        hc.table.dictionary_limit_abs = Some(64);
        hc.table.skip_insert_until_abs = 0;
    };

    let profile = HcOptimalCostProfile {
        max_chain_depth: 32,
        sufficient_match_len: usize::MAX / 2,
        accurate: true,
        favor_small_offsets: false,
    };
    let abs_pos = 96usize;
    let mut out = Vec::new();

    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra2, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra2 should retain dictionary candidates on upstream zstd-parity path"
    );

    // Drop the BtUltra2 results: the BtUltra assertion below must be decided
    // solely by what the second collection produces.
    out.clear();

    let mut hc = HcMatchGenerator::new(256);
    hc.configure(test_config, StrategyTag::BtUltra, window_log);
    prepare_history(&mut hc, abs_pos);
    hc.collect_optimal_candidates(
        abs_pos,
        160,
        profile,
        HcCandidateQuery {
            reps: [1, 4, 8],
            lit_len: 1,
            ldm_candidate: None,
        },
        &mut out,
    );
    assert!(
        out.iter().any(|candidate| candidate.offset >= 32),
        "btultra should retain dictionary candidates"
    );
}
9483
/// Compares the dfast long/short table lengths at level 3 without a source
/// size hint against the lengths after a 1 KiB hint, expecting both tables to
/// shrink to `1 << MIN_WINDOW_LOG`.
#[test]
fn driver_small_source_hint_shrinks_dfast_hash_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Unhinted run.
    driver.reset(CompressionLevel::Level(3));
    let unhinted_input = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..unhinted_input.len()].copy_from_slice(unhinted_input);
    space.truncate(unhinted_input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
    // Upstream zstd-parity split sizes: long-hash = DFAST_HASH_BITS,
    // short-hash = DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA.
    let full_long = driver.dfast_matcher().long_len();
    let full_short = driver.dfast_matcher().short_len();
    assert_eq!(full_long, 1 << DFAST_HASH_BITS);
    assert_eq!(
        full_short,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA)
    );

    // Hinted run: a 1 KiB source size is announced before the reset.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(3));
    let hinted_input = b"xyzxyzxyzxyz";
    let mut space = driver.get_next_space();
    space[..hinted_input.len()].copy_from_slice(hinted_input);
    space.truncate(hinted_input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
    let hinted_long = driver.dfast_matcher().long_len();
    let hinted_short = driver.dfast_matcher().short_len();

    // The wire `window_log` stays at its floor (decoder-interop), but the
    // internal dfast tables are sized from the RAW 1 KiB source, not the
    // floored window: `table_window = 1 << ceil_log2(1024) = 1 << 10`, so
    // both tables land at the `MIN_WINDOW_LOG` floor (the long table at
    // `dfast_hash_bits_for_window(1 << 10) = 10`, the short table one
    // `DFAST_SHORT_HASH_BITS_DELTA` step below but clamped back up to
    // `MIN_WINDOW_LOG`).
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(hinted_long, 1 << MIN_WINDOW_LOG);
    assert_eq!(hinted_short, 1 << MIN_WINDOW_LOG);
    let both_shrunk = hinted_long < full_long && hinted_short < full_short;
    assert!(
        both_shrunk,
        "tiny source hint should reduce both dfast tables"
    );
}
9529
/// Resets a level-3 driver with a `u64::MAX` source size hint, feeds one
/// small block, and checks the dfast long table is at least
/// `1 << MIN_WINDOW_LOG` entries.
#[test]
fn driver_huge_source_hint_does_not_overflow_table_window_shift() {
    // Regression: the Dfast / Row table-window sizing in `reset` derives a
    // shift from `ceil_log2(hint)`. A hint >= 2^63 + 1 makes that shift 64,
    // and `1usize << 64` panics in debug / wraps to 0 in release before the
    // `.min(max_window_size)` cap can apply. A `u64::MAX` pledged source size
    // must size the table to the real window, never panic or wrap to zero.
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.reset(CompressionLevel::Level(3));

    let input = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..input.len()].copy_from_slice(input);
    space.truncate(input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    let long_table_len = driver.dfast_matcher().long_len();
    assert!(
        long_table_len >= 1 << MIN_WINDOW_LOG,
        "huge hint must size the dfast table from the real window, not wrap to zero"
    );
}
9552
/// Resets a level-16 driver with a `u64::MAX` source size hint plus a 64 KiB
/// dictionary hint, checks the HC history capacity reaches the expected
/// ceiling, then feeds one small block.
#[test]
fn driver_huge_source_hint_with_dict_does_not_overflow_hc_reserve() {
    // Regression: the HC/BT history-mirror pre-size adds the dictionary
    // hint to the source-size hint before `reserve_history` clamps to the
    // window ceiling. A `u64::MAX` pledged source size (the "unknown size"
    // sentinel) plus any positive dictionary hint overflows `usize` in
    // `(src as usize) + dict_hint` — debug panic / release wrap on 64-bit,
    // and `src as usize` truncation on 32-bit targets. Level 16 (BtOpt)
    // routes through the HashChain/BT storage arm that owns this reserve.
    // Must size the mirror to the real window, never panic, wrap, or
    // truncate.
    let mut driver = MatchGeneratorDriver::new(32, 2);
    driver.set_source_size_hint(u64::MAX);
    driver.set_dictionary_size_hint(64 * 1024);
    driver.reset(CompressionLevel::Level(16));

    // The saturated `usize::MAX` reserve target must be clamped to the HC
    // history ceiling, not reserved literally (which would OOM/panic). Level 16
    // has window_log 22, so the ceiling is `window + window/4 + one block`
    // (the `reserve_history` formula). Assert the reserve actually reached it —
    // a no-panic-only check would also pass on an under-reserved mirror.
    let window = 1usize << 22;
    let quarter_window = window >> 2;
    let one_block = crate::common::MAX_BLOCK_SIZE as usize;
    let expected_history_ceiling = window + quarter_window + one_block;
    assert!(
        driver.hc_matcher().table.history.capacity() >= expected_history_ceiling,
        "huge source + dict hint must reserve the clamped HC history ceiling, got {}",
        driver.hc_matcher().table.history.capacity()
    );

    // Push one small block through the driver after the capacity check.
    let input = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..input.len()].copy_from_slice(input);
    space.truncate(input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
}
9588
/// Sets a `chain_log` override of 10 with a 4 KiB source hint at level 6 and
/// checks the HC table's `chain_log` equals the override after one block.
#[test]
fn driver_chain_log_override_survives_row_to_hc_fallback() {
    // Regression: when a RowHash level is forced onto the HashChain backend
    // (resolved window <= 14, upstream `ZSTD_resolveRowMatchFinderMode`), the
    // synthesised HC chain table must honour an explicit `chain_log` override.
    // The RowHash override arm drops `chain_log` (Row has no chain table), so
    // the synthesis previously replaced the caller's `chain_log` with the upstream zstd
    // `hashLog - 1`, silently ignoring it on small-window frames.
    let chain_log_override = 10u32;
    let overrides = super::parameters::ParamOverrides {
        chain_log: Some(chain_log_override),
        ..Default::default()
    };

    let mut driver = MatchGeneratorDriver::new(32, 2);
    // Small source hint pins the window to the hinted floor (16 KiB =
    // windowLog 14), so the Level 6 Row finder falls back to HashChain.
    driver.set_source_size_hint(1 << 12);
    driver.set_param_overrides(Some(overrides));
    driver.reset(CompressionLevel::Level(6));

    let input = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..input.len()].copy_from_slice(input);
    space.truncate(input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // The override (10) is below the window cap (14), so the resolved HC chain
    // table must reflect it — NOT the upstream zstd `hashLog - 1` (18, clamped to the
    // window 14). Pre-fix this resolved to 14.
    assert_eq!(
        driver.hc_matcher().table.chain_log,
        chain_log_override as usize,
        "explicit chain_log override must survive the Row->HC fallback, got {}",
        driver.hc_matcher().table.chain_log
    );
}
9623
/// Runs level 5 three times — unhinted, with a 64 KiB hint, and with a 1 KiB
/// hint — checking the row-head count, the active backend, and the window
/// size at each step.
#[test]
fn driver_small_source_hint_shrinks_row_hash_tables() {
    use super::strategy::BackendTag;

    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Step 1: no hint.
    driver.reset(CompressionLevel::Level(5));
    let unhinted_input = b"abcabcabcabc";
    let mut space = driver.get_next_space();
    space[..unhinted_input.len()].copy_from_slice(unhinted_input);
    space.truncate(unhinted_input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
    let full_rows = driver.row_matcher().row_heads.len();
    // Level 5 uses the upstream row_log (clamp(searchLog=3, 4, 6) = 4) and the
    // upstream L5 hashLog (`ZSTD_getCParams(5,..).hashLog` = 19), so the row
    // count is 1 << (ROW_L5.hash_bits - ROW_L5.row_log).
    assert_eq!(full_rows, 1 << (ROW_L5.hash_bits - ROW_L5.row_log));

    // Step 2: a hint that keeps the resolved window > 14 STILL uses the Row
    // finder (upstream `ZSTD_resolveRowMatchFinderMode`: row mode on for
    // windowLog > 14) and shrinks the row hash table to the source-derived
    // width. 64 KiB → raw source log 16, so
    // `row_hash_bits_for_window(1 << 16)` < the level's full hash_bits (19)
    // and the row count drops.
    driver.set_source_size_hint(1 << 16);
    driver.reset(CompressionLevel::Level(5));
    let hinted_input = b"xyzxyzxyzxyz";
    let mut space = driver.get_next_space();
    space[..hinted_input.len()].copy_from_slice(hinted_input);
    space.truncate(hinted_input.len());
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);
    assert_eq!(
        driver.active_backend(),
        BackendTag::Row,
        "windowLog > 14 keeps the upstream row matchfinder"
    );
    let hinted_rows = driver.row_matcher().row_heads.len();
    assert!(
        hinted_rows < full_rows,
        "a window>14 source hint should reduce the row hash table footprint"
    );

    // Step 3: a tiny hint floors the resolved window at
    // MIN_HINTED_WINDOW_LOG = 14; upstream uses the HASH-CHAIN matcher (not
    // Row) at windowLog <= 14, so the driver must route greedy/lazy/lazy2 to
    // the HashChain backend there.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.window_size(), 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(
        driver.active_backend(),
        BackendTag::HashChain,
        "windowLog <= 14 must fall back to the upstream zstd hash-chain matchfinder",
    );
}
9675
/// Feeds two identical 128 KiB periodic blocks and one 256-byte block of
/// distinct values through the row matcher, replaying every emitted sequence
/// and checking each block is reconstructed byte-for-byte.
#[test]
fn row_matches_roundtrip_multi_block_pattern() {
    const BLOCK_LEN: usize = 128 * 1024;
    let pattern = [7, 13, 44, 184, 19, 96, 171, 109, 141, 251];
    let first_block: Vec<u8> = pattern.iter().copied().cycle().take(BLOCK_LEN).collect();
    // The second block carries exactly the same bytes as the first.
    let second_block: Vec<u8> = first_block.clone();

    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);
    matcher.ensure_tables();

    // Applies one sequence to `decoded`: literals are appended as-is, and a
    // match is copied one byte at a time so it may read bytes that this same
    // loop has just appended.
    let replay_sequence = |decoded: &mut Vec<u8>, seq: Sequence<'_>| match seq {
        Sequence::Literals { literals } => decoded.extend_from_slice(literals),
        Sequence::Triple {
            literals,
            offset,
            match_len,
        } => {
            decoded.extend_from_slice(literals);
            let copy_from = decoded.len() - offset;
            for src in copy_from..copy_from + match_len {
                let byte = decoded[src];
                decoded.push(byte);
            }
        }
    };

    let mut history = Vec::new();

    // Block 1.
    matcher.add_data(first_block.clone(), |_| {});
    matcher.start_matching(|seq| replay_sequence(&mut history, seq));
    assert_eq!(history, first_block);

    // Block 2.
    matcher.add_data(second_block.clone(), |_| {});
    let second_start = history.len();
    matcher.start_matching(|seq| replay_sequence(&mut history, seq));
    assert_eq!(&history[second_start..], second_block.as_slice());

    // Force a literals-only pass so the Sequence::Literals arm is exercised.
    let third_block: Vec<u8> = (0u8..=255).collect();
    matcher.add_data(third_block.clone(), |_| {});
    let third_start = history.len();
    matcher.start_matching(|seq| replay_sequence(&mut history, seq));
    assert_eq!(&history[third_start..], third_block.as_slice());
}
9719
/// A 5-byte block must come back as literals only; a following block of
/// repeated data must then produce at least one `Sequence::Triple`.
#[test]
fn row_short_block_emits_literals_only() {
    let mut matcher = RowMatchGenerator::new(1 << 22);
    matcher.configure(ROW_CONFIG);

    matcher.add_data(b"abcde".to_vec(), |_| {});

    let mut saw_triple = false;
    let mut reconstructed = Vec::new();
    matcher.start_matching(|seq| {
        // `Sequence` has two variants here: collect literals, flag the other.
        if let Sequence::Literals { literals } = seq {
            reconstructed.extend_from_slice(literals);
        } else {
            saw_triple = true;
        }
    });

    assert!(
        !saw_triple,
        "row backend must not emit triples for short blocks"
    );
    assert_eq!(reconstructed, b"abcde");

    // Then feed a clearly matchable block and ensure the Triple arm is reachable.
    saw_triple = false;
    matcher.add_data(b"abcdeabcde".to_vec(), |_| {});
    matcher.start_matching(|seq| {
        saw_triple |= matches!(seq, Sequence::Triple { .. });
    });
    assert!(
        saw_triple,
        "row backend should emit triples on repeated data"
    );
}
9753
/// `pick_lazy_match` must hand back the supplied best candidate unchanged
/// when probed at position 0 of a six-byte block.
#[test]
fn row_pick_lazy_returns_best_when_lookahead_is_out_of_bounds() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    row.add_data(b"abcabc".to_vec(), |_| {});
    // Allocate the row tables up front. With a small enough accept floor the
    // lookahead path passes the length gate and reaches `row_candidate` ->
    // `row_heads[..]`, so the tables have to exist before probing
    // (production always calls this before any candidate probe).
    row.ensure_tables();

    let incoming = MatchCandidate {
        start: 0,
        offset: 1,
        match_len: ROW_MIN_MATCH_LEN,
    };
    let chosen = row
        .pick_lazy_match(0, 0, Some(incoming))
        .expect("best candidate must survive");

    // Field-by-field comparison: the survivor must be the candidate passed in.
    assert_eq!(chosen.start, incoming.start);
    assert_eq!(chosen.offset, incoming.offset);
    assert_eq!(chosen.match_len, incoming.match_len);
}
9778
/// The second block starts with the 3-byte tail (`XYZ`) of the first block;
/// the row matcher is expected to emit a literal-free triple at offset 3 that
/// reaches back across the block boundary.
#[test]
fn row_backfills_previous_block_tail_for_cross_boundary_match() {
    /// Applies one emitted sequence to `decoded`, the way a decoder would.
    fn replay(decoded: &mut Vec<u8>, seq: Sequence<'_>) {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        decoded.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_reference {
            let from = decoded.len() - offset;
            // Byte-by-byte so an overlapping match reads bytes pushed earlier
            // in this same copy.
            for step in 0..match_len {
                let byte = decoded[from + step];
                decoded.push(byte);
            }
        }
    }

    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);

    // 64 filler bytes followed by the "XYZ" tail.
    let mut first_block = alloc::vec![0xA5u8; 67];
    first_block[64..].copy_from_slice(b"XYZ");
    let second_block = b"XYZXYZtail".to_vec();

    // Block 1 must round-trip before block 2 is judged.
    row.add_data(first_block.clone(), |_| {});
    let mut rebuilt = Vec::new();
    row.start_matching(|seq| replay(&mut rebuilt, seq));
    assert_eq!(rebuilt, first_block);

    // Block 2: look for the cross-boundary triple while replaying.
    row.add_data(second_block.clone(), |_| {});
    let boundary = rebuilt.len();
    let mut crossed_boundary = false;
    row.start_matching(|seq| {
        crossed_boundary |= matches!(
            seq,
            Sequence::Triple { literals, offset, match_len }
                if literals.is_empty() && offset == 3 && match_len >= ROW_MIN_MATCH_LEN
        );
        replay(&mut rebuilt, seq);
    });

    assert!(
        crossed_boundary,
        "row matcher should reuse the 3-byte previous-block tail"
    );
    assert_eq!(&rebuilt[boundary..], second_block.as_slice());
}
9833
/// Skipping a block with the incompressible hint set (`Some(true)`) must
/// leave fewer occupied row slots than skipping the same block with
/// `Some(false)`.
#[test]
fn row_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    let data = deterministic_high_entropy_bytes(0xA713_9C5D_44E2_10B1, 4096);

    // Builds a fresh matcher, skips `data` under `hint`, and reports how many
    // row slots ended up non-empty.
    let occupied_slots_after_skip = |hint: Option<bool>| {
        let mut row = RowMatchGenerator::new(1 << 22);
        row.configure(ROW_CONFIG);
        row.add_data(data.clone(), |_| {});
        row.skip_matching_with_hint(hint);
        row.row_positions
            .iter()
            .filter(|&&pos| pos != ROW_EMPTY_SLOT)
            .count()
    };

    let dense_slots = occupied_slots_after_skip(Some(false));
    let sparse_slots = occupied_slots_after_skip(Some(true));

    assert!(
        sparse_slots < dense_slots,
        "incompressible hint should seed fewer row slots (sparse={sparse_slots}, dense={dense_slots})"
    );
}
9863
/// Regression for the `None` arm of `skip_matching_with_hint`: the row
/// table must NOT receive dense inserts across the skipped range.
/// Upstream zstd parity (`ZSTD_row_fillHashCache` only pre-fills the
/// next-scan cache, not the skipped block's interior) gives up cross-block
/// matches into the skipped interior in exchange for not paying the
/// per-block O(block_size) insert cost.
///
/// With an input smaller than one block (4096 B against the default
/// 128 KiB block boundary), the only positions that may appear in the row
/// table after the call are those produced by the `backfill_start`
/// lookback at the block's start (≤ `ROW_HASH_KEY_LEN - 1` positions when
/// block_start < ROW_HASH_KEY_LEN). For `current_abs_start == 0` even that
/// backfill is empty, so the table stays fully empty.
#[test]
fn row_skip_matching_with_none_hint_leaves_interior_empty() {
    let data = deterministic_high_entropy_bytes(0x9B47_F2A1_8C5E_3306, 4096);

    // Builds a fresh matcher, skips `data` under `hint`, and reports how many
    // row slots ended up non-empty.
    let occupied_slots_after_skip = |hint: Option<bool>| {
        let mut row = RowMatchGenerator::new(1 << 22);
        row.configure(ROW_CONFIG);
        row.add_data(data.clone(), |_| {});
        row.skip_matching_with_hint(hint);
        row.row_positions
            .iter()
            .filter(|&&pos| pos != ROW_EMPTY_SLOT)
            .count()
    };

    let none_slots = occupied_slots_after_skip(None);
    // Control case: Some(false) is the dict-priming path, which inserts every
    // position in the skipped range.
    let dense_slots = occupied_slots_after_skip(Some(false));

    // The contract is pinned from both sides:
    // 1) a None hint at block-start==0 inserts ZERO positions (nothing exists
    //    before position 0 to backfill);
    // 2) the dense path still populates the table, so (1) is not passing
    //    merely because nothing ever inserts.
    assert_eq!(
        none_slots, 0,
        "None hint at block_start=0 must leave row table fully empty \
         (upstream zstd parity — interior NOT inserted, no pre-block backfill possible)",
    );
    assert!(
        dense_slots > 0,
        "Some(false) dict-priming path must still insert densely \
         (sanity check: control case for the `none_slots == 0` assertion)",
    );
}
9918
/// With no source-size hint, the dfast backend must keep its default table
/// geometry: a long-hash table of `1 << DFAST_HASH_BITS` entries and a
/// short-hash table `DFAST_SHORT_HASH_BITS_DELTA` bits narrower.
///
/// NOTE: the function name still says "level2", but the level exercised is
/// `CompressionLevel::Level(3)`. The assertion messages name the level that
/// is actually under test so a failure report is not misleading.
#[test]
fn driver_unhinted_level2_keeps_default_dfast_hash_table_size() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    driver.reset(CompressionLevel::Level(3));
    // Commit a 12-byte block and skip it.
    let mut space = driver.get_next_space();
    space[..12].copy_from_slice(b"abcabcabcabc");
    space.truncate(12);
    driver.commit_space(space);
    driver.skip_matching_with_hint(None);

    // Upstream zstd-parity split: long-hash at DFAST_HASH_BITS, short-hash one
    // bit smaller (DFAST_SHORT_HASH_BITS_DELTA = 1, matching upstream zstd
    // `chainLog = hashLog - 1` for dfast levels).
    let long_len = driver.dfast_matcher().long_len();
    let short_len = driver.dfast_matcher().short_len();
    assert_eq!(
        long_len,
        1 << DFAST_HASH_BITS,
        "unhinted Level(3) should keep default long-hash table size"
    );
    assert_eq!(
        short_len,
        1 << (DFAST_HASH_BITS - DFAST_SHORT_HASH_BITS_DELTA),
        "unhinted Level(3) short-hash should be one bit smaller than long-hash"
    );
}
9946
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_backend_rejects_undersized_pooled_suffix_store() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Fastest);

    // Seed the pool with a 1024-capacity store, smaller than the block below.
    driver.suffix_pool.push(SuffixStore::with_capacity(1024));

    // Commit a 4096-byte block.
    let mut block = driver.get_next_space();
    block.clear();
    block.resize(4096, 0xAB);
    driver.commit_space(block);

    let newest_entry = driver
        .simple()
        .window
        .last()
        .expect("window entry must exist after commit");
    let slot_count = newest_entry.suffixes.slots.len();
    assert!(
        slot_count >= 4096,
        "undersized pooled suffix store must not be reused for larger blocks"
    );
}
9973
/// A 1 KiB source-size hint must resolve to the minimum hinted window, and
/// both the driver's slice size and the buffers it hands out must follow.
#[test]
fn source_hint_clamps_driver_slice_size_to_window() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    let resolved_window = driver.window_size() as usize;
    assert_eq!(resolved_window, 1 << MIN_HINTED_WINDOW_LOG);
    assert_eq!(driver.slice_size, resolved_window);

    // The next buffer handed out must be exactly one window long.
    let buffer = driver.get_next_space();
    assert_eq!(buffer.len(), resolved_window);
    driver.commit_space(buffer);
}
9988
/// A buffer obtained after the slice size shrinks must report the smaller
/// length but still have at least the capacity of the earlier, larger buffer.
#[test]
fn pooled_space_keeps_capacity_when_slice_size_shrinks() {
    let mut driver = MatchGeneratorDriver::new(128 * 1024, 2);
    driver.reset(CompressionLevel::Default);

    // Unhinted frame: take a full-size buffer, note its capacity, commit it.
    let first_buffer = driver.get_next_space();
    let large_capacity = first_buffer.capacity();
    assert!(large_capacity >= 128 * 1024);
    driver.commit_space(first_buffer);

    // Hinted frame: the slice size drops to the minimum hinted window.
    driver.set_source_size_hint(1024);
    driver.reset(CompressionLevel::Default);

    let second_buffer = driver.get_next_space();
    assert_eq!(second_buffer.len(), 1 << MIN_HINTED_WINDOW_LOG);
    let retained_capacity = second_buffer.capacity();
    assert!(
        retained_capacity >= large_capacity,
        "pooled buffer capacity should be preserved to avoid shrink/grow churn"
    );
}
10009
/// Switching from a HashChain-backed `Best` to `Fastest` must leave the
/// driver on the `Simple` backend with the smaller window.
#[test]
fn driver_best_to_fastest_releases_oversized_hc_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Start at Best, routed onto HashChain through the test-only override
    // (production `Best` sits on level 13, whose native backend differs).
    // This allocates large HC tables (4M hash, 2M chain), so the swap below
    // exercises the HC drain path this test pins.
    driver.reset_on_hc_lazy(CompressionLevel::Best);
    assert_eq!(driver.window_size(), (1u64 << 22));

    // Commit and skip a small block so `ensure_tables()` really allocates.
    let mut block = driver.get_next_space();
    block[..12].copy_from_slice(b"abcabcabcabc");
    block.truncate(12);
    driver.commit_space(block);
    driver.skip_matching_with_hint(None);

    // Dropping to Fastest makes the [`MatcherStorage`] enum swap to its
    // `Simple` variant and the `HashChain` variant is dropped. The drain
    // block in `Matcher::reset` reassigns `m.table.hash_table` /
    // `chain_table` / `hash3_table` to `Vec::new()` BEFORE the replacement
    // variant is constructed, releasing the table allocations up front.
    // That caps peak memory during the swap at "old data buffers being
    // drained into `vec_pool` + new `MatchGenerator` skeleton" instead of
    // "old tables still resident + new variant under construction".
    // `Drop` on the old variant would free the tables eventually, but only
    // after the new variant is built, so the early reassign moves the peak.
    // Once switched, the HC variant is gone; asserting that storage is now
    // `Simple` covers the invariant the old hash_table/chain_table checks
    // were standing in for.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.window_size(), (1u64 << 19));
    let backend_after_switch = driver.active_backend();
    assert_eq!(backend_after_switch, super::strategy::BackendTag::Simple);
}
10046
/// Resetting the same driver from level 13 to level 15 must grow both the
/// HC hash table and the HC chain table.
#[test]
fn driver_better_to_best_resizes_hc_tables() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // The lazy band runs on the Row backend now, so the HC resize path is
    // exercised across two BT levels whose native `HcConfig` widths differ:
    // L13 (hash_log 22, chain_log 22) -> L15 (hash_log 23, chain_log 23).
    // Each round resets to the level, commits and skips 12 bytes so
    // `ensure_tables` runs with that level's sizes, then records the
    // (hash, chain) table lengths.
    let mut table_lens = Vec::with_capacity(2);
    for (level, payload) in [(13, b"abcabcabcabc"), (15, b"xyzxyzxyzxyz")] {
        driver.reset(CompressionLevel::Level(level));
        assert_eq!(driver.window_size(), (1u64 << 22));

        let mut block = driver.get_next_space();
        block[..12].copy_from_slice(payload);
        block.truncate(12);
        driver.commit_space(block);
        driver.skip_matching_with_hint(None);

        let hc = driver.hc_matcher();
        table_lens.push((hc.table.hash_table.len(), hc.table.chain_table.len()));
    }

    let (l13_hash_len, l13_chain_len) = table_lens[0];
    let (l15_hash_len, l15_chain_len) = table_lens[1];

    // L15 must have resized to strictly larger tables.
    assert!(
        l15_hash_len > l13_hash_len,
        "L15 hash_table ({}) should be larger than L13 ({})",
        l15_hash_len,
        l13_hash_len
    );
    assert!(
        l15_chain_len > l13_chain_len,
        "L15 chain_table ({}) should be larger than L13 ({})",
        l15_chain_len,
        l13_chain_len
    );
}
10092
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // The first block repeats the 8-byte dictionary verbatim.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"abcdefgh");
    driver.commit_space(block);

    // Look for a literal-free triple reaching back 8 bytes.
    let mut dictionary_hit = false;
    driver.start_matching(|seq| {
        dictionary_hit |= matches!(
            seq,
            Sequence::Triple { literals, offset, match_len }
                if literals.is_empty() && offset == 8 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        dictionary_hit,
        "first full block should still match dictionary-primed history"
    );
}
10127
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
#[test]
fn prime_with_large_dictionary_preserves_early_history_until_first_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // 24-byte dictionary; the block below repeats only its first 8 bytes.
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(b"abcdefgh");
    driver.commit_space(block);

    // Look for a literal-free triple reaching back 24 bytes, to the start of
    // the dictionary.
    let mut early_history_hit = false;
    driver.start_matching(|seq| {
        early_history_hit |= matches!(
            seq,
            Sequence::Triple { literals, offset, match_len }
                if literals.is_empty() && offset == 24 && match_len >= MIN_MATCH_LEN
        );
    });

    assert!(
        early_history_hit,
        "dictionary bytes should remain addressable until frame output exceeds the live window"
    );
}
10162
/// Priming with empty dictionary content must still install the supplied
/// repeat-offset history on the Simple backend.
#[test]
fn prime_with_dictionary_applies_offset_history_even_when_content_is_empty() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // Empty content, non-default offsets.
    driver.prime_with_dictionary(&[], [11, 7, 3]);

    let installed_offsets = driver.simple_mut().offset_hist;
    assert_eq!(installed_offsets, [11, 7, 3]);
}
10172
/// On the HC backend, priming with an empty dictionary must install the
/// offset history and leave the btultra2 seed pass switched off.
#[test]
fn hc_prime_with_empty_dictionary_disables_btultra2_seed_pass() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    driver.prime_with_dictionary(&[], [11, 7, 3]);

    assert_eq!(driver.hc_matcher().table.offset_hist, [11, 7, 3]);

    // Query with a size one past HC_PREDEF_THRESHOLD.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming, even when dict content is empty"
    );
}
10188
/// A primed snapshot captured with LDM enabled must be refused after a reset
/// with LDM disabled, while capture and restore under the same LDM setting
/// must still succeed.
#[test]
fn primed_snapshot_not_restored_across_ldm_config_change() {
    // The CDict-equivalent primed snapshot clones `storage`; on the BT
    // backend that includes `BtMatcher::ldm_producer`. Restoring a snapshot
    // taken under one LDM configuration into a reset that resolved a
    // different one would leave a stale producer in place, so `PrimedKey`
    // has to fold the LDM override into the key: the restore is refused and
    // the caller re-primes.
    use super::parameters::CompressionParameters;

    let level = CompressionLevel::Level(19);
    let dict = b"abcdefghabcdefghabcdefgh";

    let overrides_ldm_enabled = CompressionParameters::builder(level)
        .enable_long_distance_matching(true)
        .build()
        .unwrap()
        .overrides();
    let overrides_ldm_default = CompressionParameters::builder(level)
        .build()
        .unwrap()
        .overrides();

    let mut driver = MatchGeneratorDriver::new(1024, 1);

    // Step 1: prime and capture at level 19 with LDM on.
    driver.set_param_overrides(Some(overrides_ldm_enabled));
    driver.reset(level);
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Step 2: identical dictionary and level, LDM now OFF. The captured LDM
    // state is stale, so the restore has to be refused.
    driver.set_param_overrides(Some(overrides_ldm_default));
    driver.reset(level);
    let restored_across_change = driver.restore_primed_dictionary(level);
    assert!(
        !restored_across_change,
        "primed snapshot restored across an LDM config change (stale producer)",
    );

    // Step 3 (sanity): re-prime and capture under LDM-off, then restore under
    // the IDENTICAL LDM-off config. This must match, showing the key is not
    // over-tight.
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);
    driver.reset(level);
    let restored_same_config = driver.restore_primed_dictionary(level);
    assert!(
        restored_same_config,
        "primed snapshot not restored under identical LDM config",
    );
}
10237
/// On the HC backend, priming with real dictionary content must leave the
/// btultra2 seed pass switched off.
#[test]
fn hc_prime_with_dictionary_disables_btultra2_seed_pass() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);

    driver.prime_with_dictionary(b"abcdefgh", [1, 4, 8]);

    // Query with a size one past HC_PREDEF_THRESHOLD.
    let seed_pass_enabled = driver
        .hc_matcher()
        .should_run_btultra2_seed_pass::<super::strategy::BtUltra2>(HC_PREDEF_THRESHOLD + 1);
    assert!(
        !seed_pass_enabled,
        "btultra2 warmup must stay disabled after dictionary priming with content"
    );
}
10252
/// A first block identical to the primed dictionary must be matched against
/// that dictionary by the dfast backend.
#[test]
fn dfast_prime_with_dictionary_preserves_history_for_first_full_block() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    // Level(4) runs Dfast with the greedy double-fast loop (upstream zstd
    // parity: clevels.h L3/L4 are both `ZSTD_dfast`, which has no lazy
    // lookahead). That fast loop needs at least `HASH_READ_SIZE` (8) bytes
    // ahead of the probe cursor, hence a 16-byte dict plus a 16-byte block:
    // the whole block matches the dict, offset = dict length = 16.
    driver.reset(CompressionLevel::Level(4));

    let payload = b"abcdefghijklmnop";
    driver.prime_with_dictionary(payload, [1, 4, 8]);

    // Commit the very same bytes as the first block.
    let mut block = driver.get_next_space();
    block.clear();
    block.extend_from_slice(payload);
    driver.commit_space(block);

    // Look for a literal-free triple whose offset equals the dict length.
    let mut dictionary_hit = false;
    driver.start_matching(|seq| {
        dictionary_hit |= matches!(
            seq,
            Sequence::Triple { literals, offset, match_len }
                if literals.is_empty()
                    && offset == payload.len()
                    && match_len >= DFAST_MIN_MATCH_LEN
        );
    });

    assert!(
        dictionary_hit,
        "dfast backend should match dictionary-primed history in first full block"
    );
}
10291
/// Priming with a dictionary must leave `window_size()` exactly as it was
/// before the prime.
#[test]
fn prime_with_dictionary_does_not_inflate_reported_window_size() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);

    // Sample the reported window on both sides of the prime.
    let window_before_prime = driver.window_size();
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    let window_after_prime = driver.window_size();

    assert_eq!(
        window_after_prime, window_before_prime,
        "dictionary retention budget must not change reported frame window size"
    );
}
10306
/// A snapshot captured under a hint that resolves to a larger window must be
/// refused when the next reset resolves to a smaller window at the same level.
#[test]
fn primed_snapshot_not_restored_when_window_hint_differs() {
    // The copy-snapshot key has to cover the resolved reset parameters, not
    // only the CompressionLevel. `reset()` caps window_log by the source-size
    // hint, so two frames at one level but with different hints can resolve
    // to different windows. If a snapshot taken at the larger hint were
    // restored into a reset for the smaller hint, the frame header would
    // advertise the smaller window while the matcher's `max_window_size`
    // (from the restored storage) still spanned the larger one. The encoder
    // could then emit a match (e.g. into the dictionary) past the advertised
    // window, producing an undecodable frame. Restore must REFUSE whenever
    // the resolved window differs.
    let level = CompressionLevel::Best;
    let dict = b"abcdefghABCDEFGHijklmnop";
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Frame A — large hint, larger resolved window: prime and capture.
    driver.set_source_size_hint(256 * 1024);
    driver.reset(level);
    let big_window = driver.window_size();
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Frame B — smaller hint at the SAME level, smaller resolved window.
    driver.set_source_size_hint(48 * 1024);
    driver.reset(level);
    let small_window = driver.window_size();
    assert!(
        small_window < big_window,
        "precondition: the two hints must resolve to different windows \
         (small={small_window}, big={big_window})"
    );

    assert!(
        !driver.restore_primed_dictionary(level),
        "snapshot captured at window {big_window} must NOT be restored into a \
         reset advertising window {small_window} (level alone is an insufficient key)"
    );
}
10345
/// Two different source-size hints that resolve to the same window must
/// share one primed snapshot: the restore has to succeed.
#[test]
fn primed_snapshot_restored_for_hints_in_same_window_bucket() {
    // The snapshot key has to normalize the source-size hint to the resolved
    // matcher geometry instead of the raw hinted byte count. `reset()`
    // derives every hint-dependent parameter (window_log cap,
    // HC/Fast/Dfast/Row table widths, the Fast attach-vs-copy cutoff) from
    // `ceil_log2(hint)`, so two distinct hints sharing a ceil-log bucket
    // resolve to the *identical* matcher shape. Keying on raw bytes
    // over-keys: the second frame would be forced through a full re-prime
    // although the cached snapshot fits perfectly. Restore must SUCCEED
    // across same-bucket hints.
    let level = CompressionLevel::Best;
    let dict = b"abcdefghABCDEFGHijklmnop";
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // 300 KiB and 400 KiB both sit in ceil_log2 bucket 19
    // (2^18 < n <= 2^19), so they resolve to the same window and table widths.
    driver.set_source_size_hint(300 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    driver.set_source_size_hint(400 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: same-bucket hints must resolve to the same window \
         (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "snapshot captured at a 300 KiB hint must be restored into a 400 KiB \
         hint that resolves to the identical matcher shape (raw bytes over-key)"
    );
}
10383
/// At level 22, a 20 KiB and a 100 KiB hint must resolve to the same window
/// and share one primed snapshot even though their ceil-log buckets differ.
#[test]
fn primed_snapshot_restored_across_level22_tier_hints() {
    // Level 22 folds several ceil-log buckets onto one upstream zstd
    // source-size tier: `resolve_level_params(Level(22), ..)` picks the HC
    // config and window_log by raw `<= 16 KiB / 128 KiB / 256 KiB`
    // thresholds. A 20 KiB and a 100 KiB hint (ceil-log buckets 15 and 17)
    // therefore both land in the `<= 128 KiB` tier and resolve to the
    // IDENTICAL matcher (same window_log, same HC hash/chain/search
    // geometry). A key built on the raw ceil-log bucket would still reject
    // this restore because the buckets differ; the key has to compare the
    // resolved matcher shape so both hints share one snapshot.
    let level = CompressionLevel::Level(22);
    let dict = b"abcdefghABCDEFGHijklmnop";
    let mut driver = MatchGeneratorDriver::new(8, 1);

    // Capture under the 20 KiB hint.
    driver.set_source_size_hint(20 * 1024);
    driver.reset(level);
    let window_a = driver.window_size();
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    driver.capture_primed_dictionary(level);

    // Restore under the 100 KiB hint.
    driver.set_source_size_hint(100 * 1024);
    driver.reset(level);
    let window_b = driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: both hints must land in the same Level 22 upstream zstd tier \
         (a={window_a}, b={window_b})"
    );

    assert!(
        driver.restore_primed_dictionary(level),
        "Level 22 snapshot captured at a 20 KiB hint must be restored into a \
         100 KiB hint that resolves to the same upstream zstd tier (different ceil-log \
         buckets, identical matcher shape)"
    );
}
10420
/// For each tested source-size hint, a Level(1) dict frame must report that
/// the borrowed (in-place) dict scan is supported.
#[test]
fn fast_dict_attaches_within_cutoff_bounds() {
    // Inside the attach bounds every Fast dict frame attaches. (The copy-mode
    // owned path memmoved the whole input into history each frame; attach
    // scans the input in place via the borrowed dual-base kernel.) Every hint
    // below is far under `FAST_ATTACH_DICT_CUTOFF_LOG` (2 GiB source) and the
    // dict is far under `MAX_FAST_ATTACH_DICT_REGION` (16 MiB). So a hint that
    // used to cross the old 8 KiB cutoff (8193 B) and a small one (8192 B)
    // BOTH resolve to attach, and the Simple backend reports a borrowed
    // (in-place) dict scan for both. This guards
    // `FAST_ATTACH_DICT_CUTOFF_LOG` staying high enough that no in-bounds
    // Fast hint falls back to the input-copy path. The OUT-of-bounds
    // fallbacks are covered by
    // `fast_attach_cutoff_keeps_virtual_positions_within_u32` (source) and
    // `oversized_dict_hint_routes_fast_to_copy_mode` (dict size).
    let level = CompressionLevel::Level(1);
    let dict = b"abcdefghABCDEFGHijklmnop";
    let in_bounds_hints = [8192u64, 8193, 1 << 20];

    for hint in in_bounds_hints {
        // Fresh driver per hint.
        let mut driver = MatchGeneratorDriver::new(8, 1);
        driver.set_source_size_hint(hint);
        driver.reset(level);
        driver.prime_with_dictionary(dict, [1, 4, 8]);

        let attached = driver.borrowed_dict_supported();
        assert!(
            attached,
            "Fast dict frame with hint {hint} must attach (borrowed in-place \
             dict scan), never fall back to the copy-mode input-copy path"
        );
    }
}
10447
/// Pins `FAST_ATTACH_DICT_CUTOFF_LOG` as the largest bucket whose source
/// size still fits in `u32`: `2^CUTOFF` fits, `2^(CUTOFF + 1)` does not.
#[test]
fn fast_attach_cutoff_keeps_virtual_positions_within_u32() {
    // The cutoff is 31 and NOT the full u64 source-size range because the
    // borrowed dict kernel stores virtual positions as u32 (`cur_abs as u32`).
    // The largest attached source `1 << CUTOFF` (plus the dict prefix) has to
    // stay below u32::MAX or that arithmetic wraps; the next bucket (4 GiB)
    // would. Pinning the bound means a future "just raise it to attach
    // everything" change cannot silently reintroduce the overflow — raising
    // the cutoff requires widening the kernel's position type first.
    let u32_ceiling = u64::from(u32::MAX);

    let max_attached: u64 = 1u64 << FAST_ATTACH_DICT_CUTOFF_LOG;
    assert!(
        max_attached <= u32_ceiling,
        "the largest attached source 2^{FAST_ATTACH_DICT_CUTOFF_LOG} must fit u32 \
         virtual positions",
    );

    let next_bucket_log = FAST_ATTACH_DICT_CUTOFF_LOG + 1;
    assert!(
        (1u64 << next_bucket_log) > u32_ceiling,
        "the next bucket 2^{} would overflow u32 virtual positions",
        next_bucket_log,
    );
}
10469
/// A dictionary-size hint one byte past `MAX_FAST_ATTACH_DICT_REGION` must
/// leave the borrowed in-place dict scan unavailable after priming.
#[test]
fn oversized_dict_hint_routes_fast_to_copy_mode() {
    // When the dict region exceeds the tagged attach position field
    // (`MAX_FAST_ATTACH_DICT_REGION`, 16 MiB), the Fast prime has to go to
    // COPY mode; the tagged attach fill would overflow the packed position.
    // The decision is keyed on the load-set size hint, so a hint past the
    // limit is enough to exercise it without allocating a real 16 MiB dict.
    // Copy mode leaves the borrowed in-place dict scan (attach-only)
    // unavailable.
    let oversized_hint = MAX_FAST_ATTACH_DICT_REGION + 1;

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.set_dictionary_size_hint(oversized_hint);
    driver.reset(CompressionLevel::Level(1));
    driver.prime_with_dictionary(b"small dict content with some padding here", [1, 4, 8]);

    let attached = driver.borrowed_dict_supported();
    assert!(
        !attached,
        "an oversized dict must use copy mode, not the tagged attach fill"
    );
}
10487
/// On a Level 6 driver primed with a dictionary, `block_samples_match_dict`
/// must answer `true` both for a block taken from the dictionary and for an
/// unrelated block.
#[test]
fn block_samples_match_dict_is_true_for_non_simple_backend() {
    // Production fallback: a non-Simple backend (here Row, Level 6) has no
    // dict probe, so the driver wrapper answers CONSERVATIVELY `true` for
    // ANY block. That keeps the dict frame on the scan instead of letting
    // the raw-fast-path emit a block raw and miss an embedded dict segment
    // (see `dictionary_segment_in_incompressible_input_is_matched`). Only
    // the Simple/Fast backend trades the blanket scan for a precise probe.
    let dict = b"the quick brown fox jumps over the lazy dog 0123456789abcdef";

    let mut driver = MatchGeneratorDriver::new(8, 6);
    driver.set_dictionary_size_hint(dict.len());
    driver.reset(CompressionLevel::Level(6));
    driver.prime_with_dictionary(dict, [1, 4, 8]);

    // Case 1: the block is a prefix of the dictionary itself.
    let dict_prefix = &dict[..32];
    assert!(
        driver.block_samples_match_dict(dict_prefix),
        "non-Simple backend must stay on the scan (true) for a dict frame"
    );

    // Case 2: 64 generated bytes with no intended relation to the dictionary.
    let unrelated: alloc::vec::Vec<u8> = (0..64u8)
        .map(|i| i.wrapping_mul(37).wrapping_add(13))
        .collect();
    assert!(
        driver.block_samples_match_dict(&unrelated),
        "non-Simple backend reports true regardless of block content"
    );
}
10513
#[test]
fn primed_snapshot_fast_attach_does_not_over_key_non_simple_backends() {
    // `fast_attach` is a Simple/Fast-backend concept (the 8 KiB attach-vs-copy
    // table split). Dfast/Row/HashChain each have their OWN attach/copy regime
    // (`DFAST_ATTACH_DICT_CUTOFF_LOG`, `ROW_ATTACH_DICT_CUTOFF_LOG`,
    // `HC_ATTACH_DICT_CUTOFF_LOG`), deliberately kept OUT of the `fast_attach`
    // key, which only models the Fast table split. Their snapshots are keyed
    // by the resolved matcher geometry instead, and the HC modes share one
    // window geometry so an HC cross-mode restore stays decodable (see
    // `prime_with_dictionary`).
    //
    // Either way the `fast_attach` bit must NOT enter a non-Simple snapshot
    // key: otherwise an unhinted capture (which would record
    // `fast_attach = true`) and a hinted reset resolving to the IDENTICAL
    // `LevelParams` would key differently and force a needless re-prime.
    //
    // The level is spelled explicitly to reach the Row arm: the named `Best`
    // alias now sits on level 13 (Btlazy2) and no longer does. This also pins
    // the Row arm recording its RESOLVED hash width on the unhinted path (a 0
    // default there keyed unhinted-vs-hinted apart).
    let row_level = CompressionLevel::Level(12);
    let mut snapshot_driver = MatchGeneratorDriver::new(8, 1);

    // Phase 1: prime and capture without any size hint.
    snapshot_driver.reset(row_level);
    let window_a = snapshot_driver.window_size();
    snapshot_driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    snapshot_driver.capture_primed_dictionary(row_level);

    // Phase 2: reset under a hint big enough (>= 2^window_log) that the
    // source-size cap is a no-op and the same window/params are resolved.
    snapshot_driver.set_source_size_hint(64 * 1024 * 1024);
    snapshot_driver.reset(row_level);
    let window_b = snapshot_driver.window_size();
    assert_eq!(
        window_a, window_b,
        "precondition: the large hint must resolve to the same window as the \
         unhinted level (a={window_a}, b={window_b})"
    );

    // Phase 3: the snapshot captured in phase 1 must still be restorable.
    assert!(
        snapshot_driver.restore_primed_dictionary(row_level),
        "a Row snapshot must restore across an unhinted vs large-hinted \
         reset that resolves to the identical matcher — `fast_attach` is a Fast \
         backend concept and must not over-key non-Simple shapes"
    );
}
10560
#[cfg(any())] // disabled: tested SuffixStore-per-block tail-handling specific to legacy MatchGenerator
#[test]
fn prime_with_dictionary_does_not_reuse_tiny_suffix_store() {
    let mut legacy_driver = MatchGeneratorDriver::new(8, 2);
    legacy_driver.reset(CompressionLevel::Fastest);

    // With a slice size of 8 this 9-byte dictionary leaves a 1-byte tail
    // chunk (capacity=1 suffix table) that must never be committed to the
    // matcher window.
    legacy_driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let every_entry_matchable = legacy_driver
        .simple()
        .window
        .iter()
        .all(|entry| entry.data.len() >= MIN_MATCH_LEN);
    assert!(
        every_entry_matchable,
        "dictionary priming must not commit tails shorter than MIN_MATCH_LEN"
    );
}
10580
#[test]
fn prime_with_dictionary_counts_only_committed_tail_budget() {
    let mut fast_driver = MatchGeneratorDriver::new(8, 1);
    fast_driver.reset(CompressionLevel::Fastest);
    let budget_before = fast_driver.simple_mut().max_window_size;

    // Nine bytes against a slice size of 8: one full slice plus a 1-byte tail
    // that cannot be committed.
    fast_driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let budget_after = fast_driver.simple_mut().max_window_size;
    assert_eq!(
        budget_after,
        budget_before + 8,
        "retention budget must account only for dictionary bytes actually committed to history"
    );
}
10596
#[test]
fn dfast_prime_with_dictionary_counts_four_byte_tail_budget() {
    let mut dfast_driver = MatchGeneratorDriver::new(8, 1);
    dfast_driver.reset(CompressionLevel::Level(3));
    let budget_before = dfast_driver.dfast_matcher().max_window_size;

    // Twelve bytes against a slice size of 8: one full slice plus a 4-byte
    // tail. Dfast can still use that tail through short-hash overlap into the
    // next block, so it should stay retained.
    dfast_driver.prime_with_dictionary(b"abcdefghijkl", [1, 4, 8]);

    let budget_after = dfast_driver.dfast_matcher().max_window_size;
    assert_eq!(
        budget_after,
        budget_before + 12,
        "dfast retention budget should include 4-byte dictionary tails"
    );
}
10613
#[test]
fn row_prime_with_dictionary_preserves_history_for_first_full_block() {
    // Level(5) is the greedy Row backend (LEVEL_TABLE row 5: Greedy / RowHash);
    // Level(4) now routes to Dfast, so Level(5) is needed to actually exercise
    // `RowMatchGenerator`'s dictionary priming. A 16-byte dict followed by the
    // same 16 bytes as the first block lets the whole block match the primed
    // dict at offset = dict length = 16.
    let dict_and_block = b"abcdefghijklmnop";

    let mut row_driver = MatchGeneratorDriver::new(8, 1);
    row_driver.reset(CompressionLevel::Level(5));
    row_driver.prime_with_dictionary(dict_and_block, [1, 4, 8]);

    let mut block_buf = row_driver.get_next_space();
    block_buf.clear();
    block_buf.extend_from_slice(dict_and_block);
    row_driver.commit_space(block_buf);

    // Latches once any emitted triple starts at the block boundary (no
    // literals), points back exactly one dict length, and is long enough to
    // be a Row match.
    let mut saw_match = false;
    row_driver.start_matching(|seq| {
        let hits_dictionary = matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty()
                && offset == dict_and_block.len()
                && match_len >= ROW_MIN_MATCH_LEN
        );
        saw_match |= hits_dictionary;
    });

    assert!(
        saw_match,
        "row backend should match dictionary-primed history in first full block"
    );
}
10652
#[test]
fn row_prime_with_dictionary_subtracts_uncommitted_tail_budget() {
    let mut row_driver = MatchGeneratorDriver::new(8, 1);
    row_driver.reset(CompressionLevel::Level(5));
    let window_before = row_driver.row_matcher().max_window_size;

    // Slice size is 8, so the 9-byte dictionary leaves a single trailing byte.
    // That byte cannot be committed (<4 tail) and must be subtracted from the
    // retained budget.
    row_driver.prime_with_dictionary(b"abcdefghi", [1, 4, 8]);

    let window_after = row_driver.row_matcher().max_window_size;
    assert_eq!(
        window_after,
        window_before + 8,
        "row retained window must exclude uncommitted 1-byte tail"
    );
}
10669
#[test]
fn prime_with_dictionary_budget_shrinks_after_row_eviction() {
    let mut row_driver = MatchGeneratorDriver::new(8, 1);
    row_driver.reset(CompressionLevel::Level(5));

    // Shrink the live window to a single slice so the dictionary-primed
    // slices get evicted after only a couple of blocks.
    row_driver.row_matcher_mut().max_window_size = 8;
    row_driver.reported_window_size = 8;
    let live_window = row_driver.row_matcher().max_window_size;

    // Priming with 24 bytes extends the retained window by the same amount.
    row_driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);
    assert_eq!(row_driver.row_matcher().max_window_size, live_window + 24);

    // Commit two further blocks without matching them.
    let filler_blocks: [&[u8]; 2] = [b"AAAAAAAA", b"BBBBBBBB"];
    for filler in filler_blocks {
        let mut block_buf = row_driver.get_next_space();
        block_buf.clear();
        block_buf.extend_from_slice(filler);
        row_driver.commit_space(block_buf);
        row_driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        row_driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    let window_after_eviction = row_driver.row_matcher().max_window_size;
    assert_eq!(
        window_after_eviction,
        live_window,
        "retired dictionary budget must not remain reusable for live history"
    );
}
10700
/// A Row → Simple level switch drops the Row variant, leaving Simple as
/// the active backend.
///
/// The pre-enum "window emptied" check
/// (`driver.row_matcher().window.is_empty()`) is intentionally gone: once
/// the swap has happened the `Row` variant no longer exists, so there is
/// nothing left to inspect through the accessor, and a later
/// `row_matcher()` call would panic by design. The "window cleared"
/// invariant is therefore replaced by "variant dropped". The
/// pool-recycling side of the row backend is covered by
/// [`driver_row_commit_recycles_block_buffer_into_pool`].
#[test]
fn row_get_last_space_then_reset_to_fastest_drops_row_variant() {
    use super::strategy::BackendTag;

    let block_bytes = b"row-data";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), BackendTag::Row);

    // Commit one block through the Row backend and read it back.
    let mut block_buf = driver.get_next_space();
    block_buf.clear();
    block_buf.extend_from_slice(block_bytes);
    driver.commit_space(block_buf);
    assert_eq!(driver.get_last_space(), block_bytes);

    // Switching to a Fast level must replace the backend variant.
    driver.reset(CompressionLevel::Fastest);
    assert_eq!(driver.active_backend(), BackendTag::Simple);
}
10726
/// A committed Row block hands its input buffer back to `vec_pool` right
/// away: the bytes are mirrored into the contiguous `history`, so keeping
/// a second copy in the window serves no purpose.
///
/// This guards the chunk-length window. The earlier `VecDeque<Vec<u8>>`
/// window retained one full `block_capacity` buffer per committed block,
/// which on a heavily pre-split frame ballooned peak memory to many times
/// the live byte count. With the buffer recycled at commit time the pool
/// grows by exactly one Vec per committed block.
#[test]
fn driver_row_commit_recycles_block_buffer_into_pool() {
    use super::strategy::BackendTag;

    let block_bytes = b"row-data-to-recycle";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert_eq!(driver.active_backend(), BackendTag::Row);

    let before_pool = driver.vec_pool.len();
    let mut block_buf = driver.get_next_space();
    block_buf.clear();
    block_buf.extend_from_slice(block_bytes);
    driver.commit_space(block_buf);
    let after_pool = driver.vec_pool.len();

    // Strict `>` on purpose: a fresh driver starts with `before_pool == 0`,
    // so `>=` would hold even if the commit never recycled anything. Strict
    // growth proves the buffer went back to the pool at commit time instead
    // of being retained in the window (the pre-`chunk_lens` bug).
    assert!(
        after_pool > before_pool,
        "row commit must recycle the committed block buffer into vec_pool \
         (before_pool = {before_pool}, after = {})",
        after_pool
    );
    // The committed bytes remain readable via the contiguous history mirror.
    assert_eq!(driver.get_last_space(), block_bytes);
}
10760
#[test]
fn adjust_params_for_zero_source_size_uses_min_hinted_window_floor() {
    // Start from the Level(4) parameters with the window log forced to 22;
    // a zero source size must bring it to the hinted-window floor.
    let wide_params = {
        let mut resolved = resolve_level_params(CompressionLevel::Level(4), None);
        resolved.window_log = 22;
        resolved
    };

    let floored = adjust_params_for_source_size(wide_params, 0);
    assert_eq!(floored.window_log, MIN_HINTED_WINDOW_LOG);
}
10768
#[test]
fn common_prefix_len_matches_scalar_reference_across_offsets() {
    // Byte-at-a-time oracle: index of the first differing byte, or the
    // shorter length when one slice is a prefix of the other.
    fn scalar_reference(a: &[u8], b: &[u8]) -> usize {
        a.iter()
            .zip(b)
            .position(|(lhs, rhs)| lhs != rhs)
            .unwrap_or_else(|| a.len().min(b.len()))
    }

    const TOTAL_LENS: [usize; 15] = [
        0, 1, 5, 15, 16, 17, 31, 32, 33, 64, 65, 127, 191, 257, 320,
    ];
    const STARTS: [usize; 3] = [0, 1, 3];
    const MISMATCHES: [usize; 14] = [0, 1, 7, 15, 16, 31, 32, 47, 63, 95, 127, 128, 129, 191];

    for total_len in TOTAL_LENS {
        let base: Vec<u8> = (0..total_len)
            .map(|i| ((i * 13 + 7) & 0xFF) as u8)
            .collect();

        // Starts past the end of `base` cannot be sliced and are skipped.
        for start in STARTS.into_iter().filter(|&s| s <= total_len) {
            let a = &base[start..];
            let b = a.to_vec();

            // Identical slices.
            assert_eq!(
                common_prefix_len(a, &b),
                scalar_reference(a, &b),
                "equal slices total_len={total_len} start={start}"
            );

            // One flipped byte at each probe index that fits in the slice.
            let len = a.len();
            for mismatch in MISMATCHES.into_iter().filter(|&m| m < len) {
                let mut altered = b.clone();
                altered[mismatch] ^= 0x5A;
                assert_eq!(
                    common_prefix_len(a, &altered),
                    scalar_reference(a, &altered),
                    "total_len={total_len} start={start} mismatch={mismatch}"
                );
            }

            // One flipped byte at the very last index (non-empty slices only).
            if let Some(mismatch) = len.checked_sub(1) {
                let mut altered = b.clone();
                altered[mismatch] ^= 0xA5;
                assert_eq!(
                    common_prefix_len(a, &altered),
                    scalar_reference(a, &altered),
                    "tail mismatch total_len={total_len} start={start} mismatch={mismatch}"
                );
            }
        }
    }

    // Slices of different lengths holding the same repeated byte.
    let long = alloc::vec![0xAB; 320];
    let shorter = alloc::vec![0xAB; 137];
    assert_eq!(
        common_prefix_len(&long, &shorter),
        scalar_reference(&long, &shorter)
    );
}
10831
#[test]
fn row_pick_lazy_returns_none_when_next_is_better() {
    // 64 identical bytes of history.
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    row.add_data(alloc::vec![b'a'; 64], |_| {});
    row.ensure_tables();

    // Hand the picker a minimum-length candidate at the probe position.
    let probe_abs = row.history_abs_start + 16;
    let minimal_candidate = MatchCandidate {
        start: probe_abs,
        offset: 8,
        match_len: ROW_MIN_MATCH_LEN,
    };

    let picked = row.pick_lazy_match(probe_abs, 0, Some(minimal_candidate));
    assert!(
        picked.is_none(),
        "lazy picker should defer when next position is clearly better"
    );
}
10850
#[test]
fn row_pick_lazy_depth2_returns_none_when_next2_significantly_better() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    row.lazy_depth = 2;
    row.search_depth = 0;
    row.offset_hist = [6, 9, 1];

    // Filler bytes with a hand-crafted pattern spliced into the middle.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAB");
    row.add_data(fixture, |_| {});
    row.ensure_tables();

    let probe_abs = row.history_abs_start + 20;
    let baseline = row
        .best_match(probe_abs, 0)
        .expect("expected baseline repcode match");
    assert_eq!(baseline.offset, 9);
    // The baseline length is a property of the fixture (the offset-9 rep run
    // is 6 bytes long) and does not depend on the accept threshold.
    assert_eq!(baseline.match_len, 6);

    // A +1 candidate, if there is one, must not beat the baseline.
    match row.best_match(probe_abs + 1, 1) {
        Some(plus_one) => assert!(plus_one.match_len <= baseline.match_len),
        None => {}
    }

    let plus_two = row
        .best_match(probe_abs + 2, 2)
        .expect("expected +2 candidate");
    assert!(
        plus_two.match_len > baseline.match_len + 1,
        "+2 candidate must be significantly better for depth-2 lazy skip"
    );

    let picked = row.pick_lazy_match(probe_abs, 0, Some(baseline));
    assert!(
        picked.is_none(),
        "lazy picker should defer when +2 candidate is significantly better"
    );
}
10889
#[test]
fn row_pick_lazy_depth2_keeps_best_when_next2_is_only_one_byte_better() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    row.lazy_depth = 2;
    row.search_depth = 0;
    row.offset_hist = [6, 9, 1];

    // Filler bytes with a hand-crafted pattern spliced into the middle; the
    // assertions below pin the +2 candidate at exactly one byte longer than
    // the baseline.
    let mut fixture = alloc::vec![b'x'; 40];
    fixture[11..30].copy_from_slice(b"EFABCABCAEFABCAEFAZ");
    row.add_data(fixture, |_| {});
    row.ensure_tables();

    let probe_abs = row.history_abs_start + 20;
    let baseline = row
        .best_match(probe_abs, 0)
        .expect("expected baseline repcode match");
    assert_eq!(baseline.offset, 9);
    // The baseline length is a property of the fixture (the offset-9 rep run
    // is 6 bytes long) and does not depend on the accept threshold.
    assert_eq!(baseline.match_len, 6);

    let plus_two = row
        .best_match(probe_abs + 2, 2)
        .expect("expected +2 candidate");
    assert_eq!(plus_two.match_len, baseline.match_len + 1);

    // A one-byte gain is not enough to defer: the picker returns the
    // baseline unchanged.
    let kept = row
        .pick_lazy_match(probe_abs, 0, Some(baseline))
        .expect("lazy picker should keep current best");
    assert_eq!(kept.start, baseline.start);
    assert_eq!(kept.offset, baseline.offset);
    assert_eq!(kept.match_len, baseline.match_len);
}
10923
/// Checks that `hash_and_row` derives its row index and tag from the
/// shared hash mix using the bit-splitting contract recomputed below.
#[test]
fn row_hash_and_row_extracts_high_bits() {
    let history_bytes = alloc::vec![
        0xAA, 0xBB, 0xCC, 0x11, 0x10, 0x20, 0x30, 0x40, 0xAA, 0xBB, 0xCC, 0x22, 0x50, 0x60,
        0x70, 0x80,
    ];

    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);
    row.add_data(history_bytes, |_| {});
    row.ensure_tables();

    let probe_abs = row.history_abs_start + 8;
    let (actual_row, actual_tag) = row
        .hash_and_row(probe_abs)
        .expect("row hash should be available");

    // Recompute the key the way `row_key_value` does: an mls-wide masked key
    // when 8 lookahead bytes exist, the 4-byte key in the tail. Relative
    // index 8 on a 16-byte history leaves exactly 8 bytes, so the wide arm
    // applies here.
    let rel = probe_abs - row.history_abs_start;
    let live = row.live_history();
    let key_bytes = row.mls.min(6);
    let lookahead = u64::from_le_bytes(live[rel..rel + 8].try_into().unwrap());
    let key_mask = (1u64 << (key_bytes * 8)) - 1;
    let key = lookahead & key_mask;

    // The top `row_hash_log + ROW_TAG_BITS` bits of the mix carry the row
    // index (upper part) and the tag (low byte).
    let mixed = crate::encoding::fastpath::hash_mix_u64_with_kernel(row.hash_kernel, key);
    let kept_bits = row.row_hash_log + ROW_TAG_BITS;
    let high_bits = mixed >> (u64::BITS as usize - kept_bits);
    let row_mask = (1usize << row.row_hash_log) - 1;
    let expected_row = ((high_bits >> ROW_TAG_BITS) as usize) & row_mask;
    let expected_tag = high_bits as u8;

    assert_eq!(actual_row, expected_row);
    assert_eq!(actual_tag, expected_tag);
}
10961
#[test]
fn row_repcode_skips_candidate_before_history_start() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);

    // History begins at absolute position 10, so probing position 12 with a
    // repcode offset of 3 would point at absolute position 9 — before the
    // retained history.
    row.history = alloc::vec![b'a'; 20];
    row.history_start = 0;
    row.history_abs_start = 10;
    row.offset_hist = [3, 0, 0];

    let candidate = row.repcode_candidate(12, 1);
    assert!(candidate.is_none());
}
10973
#[test]
fn row_repcode_returns_none_when_position_too_close_to_history_end() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);

    // Five bytes of history; position 4 is the final byte, leaving a single
    // byte of lookahead for the repcode probe.
    row.history = b"abcde".to_vec();
    row.history_start = 0;
    row.history_abs_start = 0;
    row.offset_hist = [1, 0, 0];

    let candidate = row.repcode_candidate(4, 1);
    assert!(candidate.is_none());
}
10985
#[cfg(all(feature = "std", target_arch = "x86_64"))]
#[test]
fn hash_mix_sse42_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath::{self, FastpathKernel};

    // Hosts without SSE4.2 have nothing to compare, so the test is a no-op
    // there.
    if is_x86_feature_detected!("sse4.2") {
        let input = 0x0123_4567_89AB_CDEFu64;
        // SAFETY: the enclosing runtime feature check guarantees SSE4.2 is
        // available.
        let direct = unsafe { fastpath::sse42::hash_mix_u64(input) };
        // The dispatcher must resolve to SSE4.2 (or better) and yield the
        // same mix.
        let via_dispatch = fastpath::dispatch_hash_mix_u64(input);
        match fastpath::select_kernel() {
            FastpathKernel::Sse42 => assert_eq!(via_dispatch, direct),
            // AVX2 kernel uses the same CRC32 instruction under the hood.
            _ => assert_eq!(via_dispatch, direct, "AVX2/SSE4.2 share CRC32 mix"),
        }
    }
}
11006
#[cfg(all(feature = "std", target_arch = "aarch64", target_endian = "little"))]
#[test]
fn hash_mix_crc_path_is_available_and_matches_accelerated_impl_when_supported() {
    use crate::encoding::fastpath;

    // Hosts without the CRC extension have nothing to compare, so the test is
    // a no-op there.
    if is_aarch64_feature_detected!("crc") {
        let input = 0x0123_4567_89AB_CDEFu64;
        // SAFETY: the enclosing runtime feature check guarantees CRC32 is
        // available.
        let direct = unsafe { fastpath::neon::hash_mix_u64(input) };
        let via_dispatch = fastpath::dispatch_hash_mix_u64(input);
        assert_eq!(via_dispatch, direct);
    }
}
11020
#[test]
fn hc_hash3_position_matches_hash3_formula() {
    use super::match_table::storage::MatchTable;

    let sample = [b'a', b'b', b'c', b'd'];

    // Reference formula: little-endian load, shift left by one byte,
    // multiply by the 3-byte prime, keep the top `HC3_HASH_LOG` bits.
    let word = u32::from_le_bytes(sample);
    let shifted = word << 8;
    let mixed = shifted.wrapping_mul(HC_PRIME3BYTES);
    let expected = (mixed >> (32 - HC3_HASH_LOG)) as usize;

    assert_eq!(MatchTable::hash3_position(&sample, HC3_HASH_LOG), expected);
}
11031
#[test]
fn hc_hash_position_matches_hash4_formula() {
    let mut chain = HcMatchGenerator::new(1 << 20);
    chain.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    let sample = [b'a', b'b', b'c', b'd'];

    // Reference formula: little-endian load, multiply by the 4-byte prime,
    // keep the top `hash_log` bits.
    let word = u32::from_le_bytes(sample);
    let mixed = word.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - chain.table.hash_log)) as usize;

    assert_eq!(chain.table.hash_position(&sample), expected);
}
11041
#[test]
fn btultra2_main_hash_uses_hash4_formula() {
    use super::match_table::storage::MatchTable;
    use super::strategy::StrategyTag;

    let mut chain = HcMatchGenerator::new(1 << 20);
    chain.configure(BTULTRA2_HC_CONFIG_L22, StrategyTag::BtUltra2, 27);

    let sample = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'];

    // Reference formula over the first four bytes only: little-endian load,
    // multiply by the 4-byte prime, keep the top `hash_log` bits.
    let word = u32::from_le_bytes(sample[..4].try_into().unwrap());
    let mixed = word.wrapping_mul(HC_PRIME4BYTES);
    let expected = (mixed >> (32 - chain.table.hash_log)) as usize;

    let actual = MatchTable::hash_position_with_mls(
        &sample,
        chain.table.hash_log,
        super::bt::BtMatcher::HASH_MLS,
    );
    assert_eq!(actual, expected);
}
11060
#[test]
fn row_candidate_returns_none_when_abs_pos_near_end_of_history() {
    let mut row = RowMatchGenerator::new(1 << 22);
    row.configure(ROW_CONFIG);

    // History is one byte short of the accept floor, so from abs_pos 0 fewer
    // than `ROW_MIN_MATCH_LEN` bytes remain. The length gate in
    // `row_candidate` must short-circuit to `None` before touching the row
    // tables, which are unbuilt here.
    let too_short = ROW_MIN_MATCH_LEN - 1;
    row.history = alloc::vec![b'a'; too_short];
    row.history_start = 0;
    row.history_abs_start = 0;

    let candidate = row.row_candidate(0, 0);
    assert!(candidate.is_none());
}
11075
#[test]
fn hc_chain_candidates_returns_sentinels_for_short_suffix() {
    // A three-byte history: every returned candidate slot must hold the
    // `usize::MAX` sentinel.
    let mut chain = HcMatchGenerator::new(32);
    chain.table.history = b"abc".to_vec();
    chain.table.history_start = 0;
    chain.table.history_abs_start = 0;
    chain.table.ensure_tables();

    let slots = chain.hc.chain_candidates(&chain.table, 0);
    let found_real_candidate = slots.iter().any(|&pos| pos != usize::MAX);
    assert!(!found_real_candidate);
}
11087
#[test]
fn hc_reset_advances_floor_past_prior_frame_entries() {
    use super::match_table::storage::MatchTable;

    let mut chain = HcMatchGenerator::new(32);
    chain.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    chain.table.ensure_tables();
    // Give the first frame real hash / chain entries.
    chain.table.insert_positions(0, 6);

    let first_frame_end = chain.table.history_abs_end();
    assert_eq!(first_frame_end, 10);
    let has_populated_slot = chain.table.hash_table.iter().any(|&v| v != HC_EMPTY);
    assert!(has_populated_slot);

    chain.reset(|_| {});

    // Behavioural contract: entries from the previous frame can no longer be
    // matched. Rather than zeroing the tables, `reset` moves the floor past
    // every prior position, so each populated slot now decodes to an absolute
    // position strictly below `history_abs_start` and is rejected by the
    // `window_low` guard before any byte is read.
    assert_eq!(chain.table.history_abs_start, first_frame_end);

    let floor = chain.table.history_abs_start;
    let base = chain.table.position_base;
    let shift = chain.table.index_shift;
    let decoded_positions = chain
        .table
        .hash_table
        .iter()
        .filter_map(|&slot| MatchTable::stored_abs_position_fast(slot, base, shift));
    for candidate_abs in decoded_positions {
        assert!(
            candidate_abs < floor,
            "a prior-frame entry must resolve below the advanced floor"
        );
    }
}
11119
#[test]
fn hc_reset_full_zeroes_when_floor_would_cross_ceiling() {
    use super::match_table::storage::REBASE_RESET_FLOOR_CEILING;

    let mut chain = HcMatchGenerator::new(32);
    chain.table.add_data(b"abcdeabcde".to_vec(), |_| {});
    chain.table.ensure_tables();

    // Fill both tables with recognisable non-empty values.
    chain.table.hash_table.fill(123);
    chain.table.chain_table.fill(456);

    // Move the would-be floor (`history_abs_end`) past the ceiling so `reset`
    // takes the bounded fallback: rewind to the origin and zero the tables,
    // which keeps the absolute cursor from climbing toward `usize::MAX` on
    // 32-bit targets.
    chain.table.history_abs_start = REBASE_RESET_FLOOR_CEILING;

    chain.reset(|_| {});

    let table = &chain.table;
    assert_eq!(table.history_abs_start, 0);
    assert_eq!(table.position_base, 0);
    assert!(table.hash_table.iter().all(|&v| v == HC_EMPTY));
    assert!(table.chain_table.iter().all(|&v| v == HC_EMPTY));
}
11141
#[test]
fn hc_start_matching_returns_early_for_empty_current_block() {
    let mut chain = HcMatchGenerator::new(32);
    chain.table.add_data(Vec::new(), |_| {});

    // Count callback invocations; an empty block must produce none.
    let mut emitted = 0usize;
    chain.start_matching(|_| emitted += 1);

    assert!(
        emitted == 0,
        "empty current block should not emit sequences"
    );
}
11150
/// Produces `len` reproducible bytes from a 64-bit xorshift state seeded with
/// `seed`.
///
/// Each step applies the shift triple 13 (left), 7 (right), 17 (left) and
/// emits bits 40..48 of the updated state as one byte. The same `seed` always
/// yields the same stream; a `seed` of 0 never leaves the zero state and so
/// yields only zero bytes.
#[cfg(test)]
fn deterministic_high_entropy_bytes(seed: u64, len: usize) -> Vec<u8> {
    let mut rng = seed;
    (0..len)
        .map(|_| {
            rng ^= rng << 13;
            rng ^= rng >> 7;
            rng ^= rng << 17;
            (rng >> 40) as u8
        })
        .collect()
}
11163
/// Splits `data` into consecutive `(start, len)` block ranges, sizing each
/// block with `optimal_block_size` at `CompressionLevel::Level(22)`.
///
/// Every range is at least one byte long and never longer than
/// `HC_BLOCKSIZE_MAX` or the bytes remaining, so the ranges tile `data`
/// exactly. Empty input yields an empty vector.
#[cfg(feature = "bench_internals")]
pub(crate) fn level22_block_ranges(data: &[u8]) -> Vec<(usize, usize)> {
    let block_max = super::cost_model::HC_BLOCKSIZE_MAX;
    let total = data.len();

    let mut spans = Vec::new();
    let mut offset = 0usize;
    let mut savings = 0i64;
    while offset < total {
        let left = total - offset;
        let window_len = left.min(block_max);
        let proposed = crate::encoding::frame_compressor::optimal_block_size(
            CompressionLevel::Level(22),
            &data[offset..offset + window_len],
            left,
            block_max,
            savings,
        );
        // Keep the proposal inside the candidate window and always advance.
        let block_len = proposed.min(window_len).max(1);
        spans.push((offset, block_len));
        offset += block_len;

        // The exact upstream zstd gate uses compressed-size savings. For this
        // corpus parity harness, once the first full block's worth of input
        // has been consumed, a fixed positive savings value is sufficient to
        // authorize the same pre-block splitter path.
        if offset >= block_max {
            savings = 3;
        }
    }
    spans
}
11192
/// Folds block-delimiter pseudo-sequences into the triple that follows them.
///
/// A delimiter is an entry whose offset and match length are both zero; its
/// literal length is carried forward (saturating) and added to the literal
/// length of the next real triple. Literals still pending after the last
/// entry are emitted as one trailing `(literals, 0, 0)` entry.
#[cfg(feature = "bench_internals")]
fn merge_block_delimiters(sequences: Vec<(usize, usize, usize)>) -> Vec<(usize, usize, usize)> {
    let mut merged = Vec::with_capacity(sequences.len());
    let mut carried = 0usize;
    for entry in sequences {
        match entry {
            // Delimiter: accumulate its literals only.
            (lit_len, 0, 0) => carried = carried.saturating_add(lit_len),
            // Real triple: absorb whatever was carried, then start afresh.
            (lit_len, offset, match_len) => {
                merged.push((lit_len.saturating_add(carried), offset, match_len));
                carried = 0;
            }
        }
    }
    if carried > 0 {
        merged.push((carried, 0, 0));
    }
    merged
}
11210
/// White-box capture of the level-22 sequence stream the match generator
/// emits for `data`, as (literal-length, offset, match-length) triples.
///
/// Block-delimiter pseudo-sequences are merged into the literal run of the
/// triple that follows them, and any entry left with both a zero offset and
/// a zero match length is dropped. Pure Rust; the C-conformance comparison
/// that consumes the result lives in the `ffi-bench` crate.
#[cfg(feature = "bench_internals")]
pub(crate) fn collect_level22_sequences(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let raw = collect_level22_sequences_with_delimiters(data);
    let mut merged = merge_block_delimiters(raw);
    merged.retain(|&(_, offset, match_len)| offset != 0 || match_len != 0);
    merged
}
11223
/// Runs the level-22 match generator over `data`, one block per range from
/// `level22_block_ranges`, and records every emitted sequence as a
/// (literal-length, offset, match-length) triple.
///
/// Literal-only sequences are recorded as `(literals.len(), 0, 0)`; these
/// are the block-delimiter entries the callers merge afterwards.
#[cfg(feature = "bench_internals")]
fn collect_level22_sequences_with_delimiters(data: &[u8]) -> Vec<(usize, usize, usize)> {
    let mut driver = MatchGeneratorDriver::new(super::cost_model::HC_BLOCKSIZE_MAX, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Level(22));

    let mut sequences = Vec::new();
    for (chunk_start, chunk_len) in level22_block_ranges(data) {
        let chunk = &data[chunk_start..chunk_start + chunk_len];
        let mut space = driver.get_next_space();
        // Fill the buffer the same way every other call site in this file
        // does. Slicing `space[..chunk.len()]` would panic whenever the
        // returned buffer is shorter than the chunk; clear + extend yields
        // the same contents without depending on the buffer's initial length.
        space.clear();
        space.extend_from_slice(chunk);
        driver.commit_space(space);
        driver.start_matching(|seq| {
            let entry = match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            };
            sequences.push(entry);
        });
    }
    sequences
}
11251
/// After a sparse `skip_matching(Some(true))` over the first block, a second
/// block that begins with the first block's final bytes must still yield an
/// immediate match (no leading literals) whose offset points back at that tail.
#[test]
fn hc_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";
    let mut first = deterministic_high_entropy_bytes(0xD1B5_4A32_9C77_0E19, 4096);
    // Overwrite the end of the high-entropy block with a recognisable marker.
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    // `first` is not read again, so hand the buffer over instead of cloning it.
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // The second block opens with the marker, followed by fresh literals.
    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.table.add_data(second, |_| {});

    // Record only the first sequence the matcher emits.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_some() {
            return;
        }
        first_sequence = Some(match seq {
            Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals.len(), offset, match_len),
        });
    });

    let (literals_len, offset, match_len) =
        first_sequence.expect("expected at least one sequence after sparse skip");
    assert_eq!(
        literals_len, 0,
        "first sequence should start at block boundary"
    );
    assert_eq!(
        offset,
        tail.len(),
        "first match should reference previous tail"
    );
    assert!(
        match_len >= tail.len(),
        "tail-aligned cross-block match must be preserved"
    );
}
11297
/// BtUltra2-configured variant of the sparse-skip boundary check: once the
/// first block has been skipped sparsely, a following block that opens with
/// the first block's trailing bytes must match them immediately.
#[test]
fn btultra2_sparse_skip_matching_preserves_tail_cross_block_match() {
    let marker = b"Bt9kLm2Rp";

    let mut generator = HcMatchGenerator::new(1 << 20);
    generator.configure(
        BTULTRA2_HC_CONFIG_L22,
        super::strategy::StrategyTag::BtUltra2,
        20,
    );

    // High-entropy block whose last bytes are replaced by the marker.
    let mut leading_block = deterministic_high_entropy_bytes(0xA9C3_7F21_D4E8_510B, 4096);
    let marker_at = leading_block.len() - marker.len();
    leading_block[marker_at..].copy_from_slice(marker);
    generator.table.add_data(leading_block, |_| {});
    generator.skip_matching(Some(true));

    // Follow-up block: marker first, then unrelated literals.
    let mut following_block = marker.to_vec();
    following_block.extend_from_slice(b"after-tail-literals");
    generator.table.add_data(following_block, |_| {});

    // Only the very first emitted sequence is of interest.
    let mut captured: Option<(usize, usize, usize)> = None;
    generator.start_matching(|seq| {
        if captured.is_none() {
            captured = Some(match seq {
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
            });
        }
    });

    let (lit_len, back_offset, matched) =
        captured.expect("expected at least one sequence after sparse BT skip");
    assert_eq!(
        lit_len, 0,
        "BT sparse skip should preserve an immediate boundary match"
    );
    assert_eq!(
        back_offset,
        marker.len(),
        "first BT match should reference previous tail"
    );
    assert!(
        matched >= marker.len(),
        "BT sparse skip must seed the dense tail for cross-block matching"
    );
}
11348
/// After a sparse `skip_matching(Some(true))`, a position that lies both on
/// the sparse grid and inside the dense tail must not have been inserted a
/// second time: its chain-table slot must not hold `rel + 1`, the value the
/// assertion message describes as a self-loop entry.
#[test]
fn hc_sparse_skip_matching_does_not_reinsert_sparse_tail_positions() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    let first = deterministic_high_entropy_bytes(0xC2B2_AE3D_27D4_EB4F, 4096);
    // Only the length is needed later, so capture it and move the buffer
    // instead of cloning all 4096 bytes.
    let current_len = first.len();
    matcher.table.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    // Absolute span of the block that was just skipped.
    let current_abs_start =
        matcher.table.history_abs_start + matcher.table.window_size - current_len;
    let current_abs_end = current_abs_start + current_len;
    // Start of the tail window examined below, clamped to live history and
    // to the start of the block.
    let dense_tail = HC_MIN_MATCH_LEN + INCOMPRESSIBLE_SKIP_STEP;
    let tail_start = current_abs_end
        .saturating_sub(dense_tail)
        .max(matcher.table.history_abs_start)
        .max(current_abs_start);

    // First tail position that also falls on the sparse insertion grid.
    let overlap_pos = (tail_start..current_abs_end)
        .find(|&pos| (pos - current_abs_start).is_multiple_of(INCOMPRESSIBLE_SKIP_STEP))
        .expect("fixture should contain at least one sparse-grid overlap in dense tail");

    let rel = matcher
        .table
        .relative_position(overlap_pos)
        .expect("overlap position should be representable as relative position");
    let chain_idx = rel as usize & ((1 << matcher.table.chain_log) - 1);
    assert_ne!(
        matcher.table.chain_table[chain_idx],
        rel + 1,
        "sparse-grid tail positions must not be reinserted (self-loop chain entry)"
    );
}
11381
/// With 26 bytes of history and `history_start` at 16, `compact_history`
/// must drop the consumed prefix and reset `history_start` to zero.
#[test]
fn hc_compact_history_drains_when_threshold_crossed() {
    let mut generator = HcMatchGenerator::new(8);
    let table = &mut generator.table;
    table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    table.history_start = 16;

    table.compact_history();

    assert_eq!(table.history_start, 0);
    assert_eq!(table.history, b"qrstuvwxyz");
}
11391
/// `insert_position_no_rebase` must leave both the hash table and the chain
/// table untouched when asked to insert absolute position 0 while
/// `position_base` is 1.
#[test]
fn hc_insert_position_no_rebase_returns_when_relative_pos_unavailable() {
    let mut generator = HcMatchGenerator::new(32);
    let table = &mut generator.table;
    table.history = b"abcdefghijklmnop".to_vec();
    table.position_base = 1;
    table.history_abs_start = 0;
    table.ensure_tables();

    // Snapshot both tables so any mutation is detectable afterwards.
    let (hash_snapshot, chain_snapshot) = (table.hash_table.clone(), table.chain_table.clone());

    table.insert_position_no_rebase(0);

    assert_eq!(table.hash_table, hash_snapshot);
    assert_eq!(table.chain_table, chain_snapshot);
}
11407
/// A contiguous `insert_positions(0, 9)` call must move the
/// `next_to_update3` cursor from 0 to 9.
#[test]
fn hc_insert_positions_advances_next_to_update3_for_contiguous_range() {
    let mut generator = HcMatchGenerator::new(64);
    let table = &mut generator.table;
    table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    // Anchor every coordinate origin at zero before building the tables.
    table.position_base = 0;
    table.history_abs_start = 0;
    table.history_start = 0;
    table.ensure_tables();
    table.next_to_update3 = 0;

    table.insert_positions(0, 9);

    assert_eq!(
        table.next_to_update3, 9,
        "contiguous insert_positions should advance hash3 update cursor"
    );
}
11425
/// A strided `insert_positions_with_step(0, 16, 4)` call must leave the
/// `next_to_update3` cursor at 0, because the positions between the strides
/// were never inserted.
#[test]
fn hc_insert_positions_with_step_keeps_next_to_update3_cursor_for_sparse_ranges() {
    let mut generator = HcMatchGenerator::new(64);
    let table = &mut generator.table;
    table.history = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    // Anchor every coordinate origin at zero before building the tables.
    table.position_base = 0;
    table.history_abs_start = 0;
    table.history_start = 0;
    table.ensure_tables();
    table.next_to_update3 = 0;

    table.insert_positions_with_step(0, 16, 4);

    assert_eq!(
        table.next_to_update3, 0,
        "sparse insert_positions_with_step must not mark skipped positions as hash3-updated"
    );
}
11443
#[cfg(any())]
// disabled: tests legacy SuffixStore behavior incompatible with upstream zstd-shape kernel's HASH_READ_SIZE geometry
/// Simple-backend check that priming with a 24-byte dictionary widens the
/// window by 24, and that committing two 8-byte blocks afterwards retires
/// the whole dictionary budget and restores the base window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_simple_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Fastest);
    // Keep the live window tiny so the primed dictionary slices are evicted
    // after very little input, making budget retirement deterministic.
    driver.reported_window_size = 8;
    driver.simple_mut().max_window_size = 8;

    let window_before_prime = driver.simple_mut().max_window_size;
    let dictionary = b"abcdefghABCDEFGHijklmnop";
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);
    assert_eq!(
        driver.simple_mut().max_window_size,
        window_before_prime + 24
    );

    // Push two full windows of data through without matching.
    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buffer = driver.get_next_space();
        buffer.clear();
        buffer.extend_from_slice(payload);
        driver.commit_space(buffer);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.simple_mut().max_window_size,
        window_before_prime,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11477
/// Dfast-backend check that priming with a 24-byte dictionary widens the
/// window by 24, and that committing two 8-byte blocks afterwards retires
/// the whole dictionary budget and restores the base window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_dfast_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(3));
    // Keep the live window tiny so the primed dictionary slices are evicted
    // after very little input, making budget retirement deterministic.
    driver.reported_window_size = 8;
    driver.dfast_matcher_mut().max_window_size = 8;

    let window_before_prime = driver.dfast_matcher().max_window_size;
    let dictionary = b"abcdefghABCDEFGHijklmnop";
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);
    assert_eq!(
        driver.dfast_matcher().max_window_size,
        window_before_prime + 24
    );

    // Push two full windows of data through without matching.
    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buffer = driver.get_next_space();
        buffer.clear();
        buffer.extend_from_slice(payload);
        driver.commit_space(buffer);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.dfast_matcher().max_window_size,
        window_before_prime,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11509
/// After priming the hash-chain backend with an 8-byte dictionary, a first
/// block that repeats those 8 bytes must produce a zero-literal match at
/// offset 8 with a length of at least `HC_MIN_MATCH_LEN`.
#[test]
fn hc_prime_with_dictionary_preserves_history_for_first_full_block() {
    let dictionary = b"abcdefgh";

    let mut driver = MatchGeneratorDriver::new(8, 1);
    // `Better` would pick the Row backend in production; this test pins
    // hash-chain dictionary priming, so select HashChain explicitly.
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);

    // Feed the dictionary bytes back as the block so the matcher has an
    // 8-byte repeat to find in the primed history.
    let mut buffer = driver.get_next_space();
    buffer.clear();
    buffer.extend_from_slice(b"abcdefgh");
    driver.commit_space(buffer);

    let mut saw_match = false;
    driver.start_matching(|seq| {
        // Latch to true on the first sequence with the expected shape.
        saw_match |= matches!(
            seq,
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } if literals.is_empty() && offset == 8 && match_len >= HC_MIN_MATCH_LEN
        );
    });

    assert!(
        saw_match,
        "hash-chain backend should match dictionary-primed history in first full block"
    );
}
11546
/// HashChain-backend check that priming with a 24-byte dictionary widens the
/// window by 24, and that committing two 8-byte blocks afterwards retires
/// the whole dictionary budget and restores the base window.
#[test]
fn prime_with_dictionary_budget_shrinks_after_hc_eviction() {
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // Tiny live window: the primed dictionary slices get evicted quickly.
    driver.reported_window_size = 8;
    driver.hc_matcher_mut().table.max_window_size = 8;

    let window_before_prime = driver.hc_matcher().table.max_window_size;
    let dictionary = b"abcdefghABCDEFGHijklmnop";
    driver.prime_with_dictionary(dictionary, [1, 4, 8]);
    assert_eq!(
        driver.hc_matcher().table.max_window_size,
        window_before_prime + 24
    );

    // Push two full windows of data through without matching.
    for payload in [b"AAAAAAAA", b"BBBBBBBB"] {
        let mut buffer = driver.get_next_space();
        buffer.clear();
        buffer.extend_from_slice(payload);
        driver.commit_space(buffer);
        driver.skip_matching_with_hint(None);
    }

    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "dictionary budget should be fully retired once primed dict slices are evicted"
    );
    assert_eq!(
        driver.hc_matcher().table.max_window_size,
        window_before_prime,
        "retired dictionary budget must not remain reusable for live history"
    );
}
11577
/// A second frame that re-borrows the resident dictionary must get its
/// retained-dictionary budget back from `reapply_resident_dictionary`.
#[test]
fn resident_reapply_restores_retained_dictionary_budget() {
    // Background: the per-frame `reset` zeroes the driver-level retained-dict
    // budget while the matcher's own `reset` re-inflates `max_window_size` by
    // the dictionary region. If the reused-dict frame (which skips the
    // re-prime) does not restore the budget, `retire_dictionary_budget` never
    // shrinks the inflated window as the dictionary evicts. On the HashChain
    // backend, whose `window_low` is measured against `max_window_size`, a
    // post-eviction match could then exceed the frame header's base window
    // and emit an over-window offset.
    let dict = b"abcdefghABCDEFGHijklmnopqrstuvwxyz0123456789";
    let mut driver = MatchGeneratorDriver::new(1 << 16, 1);

    // First frame: prime normally and note the reported base window.
    driver.set_dictionary_size_hint(dict.len());
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    driver.prime_with_dictionary(dict, [1, 4, 8]);
    let base = driver.reported_window_size;
    assert!(
        driver.dictionary_retained_budget > 0,
        "the priming frame must retain a non-zero dict budget"
    );

    // Second frame: reset finds the dictionary resident and re-borrows it.
    driver.set_dictionary_size_hint(dict.len());
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    assert!(
        driver.dictionary_is_resident(),
        "the second frame must re-borrow the resident dictionary"
    );
    assert_eq!(
        driver.dictionary_retained_budget, 0,
        "reset clears the retained-dict budget"
    );
    let inflated = driver.hc_matcher().table.max_window_size;
    assert!(
        inflated > base,
        "reset re-inflates the window by the resident dict region \
         (inflated={inflated}, base={base})"
    );

    // Reapplying must set the budget to exactly the window inflation.
    driver.reapply_resident_dictionary([1, 4, 8]);
    let expected_budget = inflated - base;
    assert_eq!(
        driver.dictionary_retained_budget,
        expected_budget,
        "resident reapply must restore the retained-dict budget (= window \
         inflation) so the retire path can shrink the window as the dict evicts"
    );
}
11625
/// Committing a small block into a roomy HashChain window must leave the
/// retained dictionary budget exactly as it was after priming.
#[test]
fn hc_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression context: since the window<->history dedup,
    // MatchTable::add_data calls its reuse_space callback with the *input*
    // buffer (for recycling) rather than with evicted chunks. The HC arm of
    // commit_space therefore has to compute evicted bytes from the
    // window_size delta. Treating the callback argument as evicted data
    // would bill the entire committed block as "evicted" and retire
    // dictionary budget too early, even with the window far from full.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset_on_hc_lazy(CompressionLevel::Better);
    // Make the live window large enough that a tiny block evicts nothing.
    driver.reported_window_size = 1 << 20;
    driver.hc_matcher_mut().table.max_window_size = 1 << 20;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

    let primed_budget = driver.dictionary_retained_budget;
    assert!(
        primed_budget > 0,
        "priming must retain a non-zero dictionary budget"
    );

    // Commit one 8-byte block and skip matching on it.
    let mut buffer = driver.get_next_space();
    buffer.clear();
    buffer.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(buffer);
    driver.skip_matching_with_hint(None);

    assert_eq!(
        driver.dictionary_retained_budget, primed_budget,
        "a commit that evicts nothing must retire no dictionary budget"
    );
}
11658
/// Committing a small block into a roomy Row window must leave the retained
/// dictionary budget exactly as it was after priming.
#[test]
fn row_commit_without_eviction_retires_no_dictionary_budget() {
    // Regression context for the Row arm of commit_space after the
    // window -> chunk_lens migration: RowMatchGenerator::add_data now calls
    // its reuse_space callback with the *input* buffer (per-commit recycle)
    // rather than with evicted chunks. Like the Dfast / HashChain arms, the
    // Row arm must compute evicted bytes from the window_size delta.
    // Treating the callback argument as evicted data bills the entire
    // committed block as "evicted" and retires dictionary budget too early,
    // even with the window far from full.
    let mut driver = MatchGeneratorDriver::new(8, 1);
    driver.reset(CompressionLevel::Level(5));
    assert!(matches!(driver.storage, MatcherStorage::Row(_)));
    // Make the live window large enough that a tiny block evicts nothing.
    driver.reported_window_size = 1 << 20;
    driver.row_matcher_mut().max_window_size = 1 << 20;
    driver.prime_with_dictionary(b"abcdefghABCDEFGHijklmnop", [1, 4, 8]);

    let primed_budget = driver.dictionary_retained_budget;
    assert!(
        primed_budget > 0,
        "priming must retain a non-zero dictionary budget"
    );

    // Commit one 8-byte block and skip matching on it.
    let mut buffer = driver.get_next_space();
    buffer.clear();
    buffer.extend_from_slice(b"AAAAAAAA");
    driver.commit_space(buffer);
    driver.skip_matching_with_hint(None);

    assert_eq!(
        driver.dictionary_retained_budget, primed_budget,
        "a Row commit that evicts nothing must retire no dictionary budget"
    );
}
11693
/// With `history_abs_start` moved beyond `u32::MAX`, `skip_matching` must
/// rebase `position_base` onto the oldest live position and still leave the
/// hash table populated and chain lookups functional.
#[test]
fn hc_rebases_positions_after_u32_boundary() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;
    // The fabricated origin does not fit in `usize` on 32-bit targets;
    // there is nothing to test in that case.
    let Ok(beyond_u32) = usize::try_from(u64::from(u32::MAX) + 64) else {
        return;
    };
    // Pretend the stream has run long enough for absolute history positions
    // to leave the u32 range. Before #51 this disabled HC inserts entirely.
    matcher.table.history_abs_start = beyond_u32;
    matcher.skip_matching(None);
    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should anchor to the oldest live absolute position"
    );

    let hash_populated = matcher
        .table
        .hash_table
        .iter()
        .any(|&slot| slot != HC_EMPTY);
    assert!(
        hash_populated,
        "HC hash table should still be populated after crossing u32 boundary"
    );

    // Table population alone is not enough: candidate lookup must work too.
    let probe_pos = matcher.table.history_abs_start + 10;
    let candidates = matcher.hc.chain_candidates(&matcher.table, probe_pos);
    assert!(
        candidates.iter().any(|&found| found != usize::MAX),
        "chain_candidates should return valid matches after rebase"
    );
}
11730
// Restricted to 64-bit targets: a >4 GiB absolute cursor like the one
// fabricated here cannot exist when usize == u32, and placing
// `history_abs_start` near `u32::MAX` on such a target overflows `usize` in
// the `check_stream_abs_headroom` guard before the rebase path runs. This
// plays the same role as the `try_into()` early-return guard in
// `hc_rebases_positions_after_u32_boundary`.
#[cfg(target_pointer_width = "64")]
#[test]
fn row_rebases_positions_after_u32_boundary() {
    // Row keeps absolute match positions as u32. Over a long stream the
    // cumulative cursor leaves the u32 range although the live window stays
    // bounded, so `add_data` has to rebase the coordinate origin down to the
    // oldest live byte instead of asserting. Before the rebase landed, the
    // `< u32::MAX` assertion panicked here and valid long Row-backed frames
    // were dropped.
    let mut row = RowMatchGenerator::new(64);
    row.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});

    // Pretend ~4 GiB of stream already sits behind the bounded window, so the
    // live bytes end up just below the u32 absolute ceiling.
    let near_ceiling = (u32::MAX as usize) - 16;
    row.history_abs_start = near_ceiling;

    // This commit would carry a u32 position over the ceiling; `add_data`
    // has to rebase the origin rather than panic.
    row.add_data(b"fghij".to_vec(), |_| {});

    let rebased_start = row.history_abs_start;
    assert!(
        rebased_start < near_ceiling,
        "add_data must rebase the absolute origin down when the cursor nears \
         u32::MAX (got {})",
        rebased_start
    );
    let live_end = row.history_abs_start + row.window_size;
    assert!(
        live_end < u32::MAX as usize,
        "after rebase the live window must fit below the u32 position ceiling"
    );
}
11768
/// `maybe_rebase_positions(abs_pos)` must rebuild the tables from positions
/// strictly below `abs_pos` only. The result is compared against a reference
/// table that is filled by hand with exactly those positions.
#[test]
fn hc_rebase_rebuilds_only_inserted_prefix() {
    let mut matcher = HcMatchGenerator::new(64);
    matcher.table.add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    matcher.table.ensure_tables();
    matcher.table.position_base = 0;
    // Nothing to test when the fabricated origin does not fit in `usize`.
    let Ok(history_abs_start) = usize::try_from(u64::from(u32::MAX) + 64) else {
        return;
    };
    matcher.table.history_abs_start = history_abs_start;
    let abs_pos = matcher.table.history_abs_start + 6;

    // Reference: same data, already anchored at the new base, with empty
    // tables that receive only the positions preceding `abs_pos`.
    let mut reference = HcMatchGenerator::new(64);
    reference
        .table
        .add_data(b"abcdeabcdeabcde".to_vec(), |_| {});
    reference.table.ensure_tables();
    reference.table.history_abs_start = history_abs_start;
    reference.table.position_base = reference.table.history_abs_start;
    reference.table.hash_table.fill(HC_EMPTY);
    reference.table.chain_table.fill(HC_EMPTY);
    let first_live = reference.table.history_abs_start;
    (first_live..abs_pos).for_each(|pos| reference.table.insert_position_no_rebase(pos));

    matcher.table.maybe_rebase_positions(abs_pos);

    assert_eq!(
        matcher.table.position_base, matcher.table.history_abs_start,
        "rebase should still anchor to the oldest live absolute position"
    );
    assert_eq!(
        matcher.table.hash_table, reference.table.hash_table,
        "rebase must rebuild only positions already inserted before abs_pos"
    );
    assert_eq!(
        matcher.table.chain_table, reference.table.chain_table,
        "future positions must not be pre-seeded into HC chains during rebase"
    );
}
11808
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// A one-slot `SuffixStore` must accept an insert and then report the key
/// through both `contains_key` and `get`.
#[test]
fn suffix_store_with_single_slot_does_not_panic_on_keying() {
    let key = b"abcde";
    let mut store = SuffixStore::with_capacity(1);
    store.insert(key, 0);

    assert!(store.contains_key(key));
    assert_eq!(store.get(key), Some(0));
}
11817
#[cfg(any())]
// disabled: hash_fill_step is a legacy MatchGenerator field; FastKernelMatcher walks stride=1 today
/// Checks the hash-fill step chosen for `Uncompressed` and `Fastest`, then
/// the backend, window size and lazy depth selected for `Better`.
#[test]
fn fastest_reset_uses_interleaved_hash_fill_step() {
    let mut driver = MatchGeneratorDriver::new(32, 2);

    // Each simple-backend level paired with the fill step it must select.
    for (level, expected_step) in [
        (CompressionLevel::Uncompressed, 1),
        (CompressionLevel::Fastest, FAST_HASH_FILL_STEP),
    ] {
        driver.reset(level);
        assert_eq!(driver.simple().hash_fill_step, expected_step);
    }

    // `Better` runs on the HashChain backend with lazy2; confirm that the
    // backend was switched and that lazy_depth has the expected value.
    driver.reset(CompressionLevel::Better);
    assert_eq!(
        driver.active_backend(),
        super::strategy::BackendTag::HashChain
    );
    assert_eq!(driver.window_size(), 1u64 << 23);
    assert_eq!(driver.hc_matcher().hc.lazy_depth, 2);
}
11840
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// The first sequence for `abcdeabcdeabcde` must be a 5-literal triple with
/// offset 5 and match length 10, after which `offset_hist` must be `[5, 1, 4]`.
#[test]
fn simple_matcher_updates_offset_history_after_emitting_match() {
    let mut generator = MatchGenerator::new(64);
    let input = b"abcdeabcdeabcde".to_vec();
    generator.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    let emitted = generator.next_sequence(|seq| {
        assert_eq!(
            seq,
            Sequence::Triple {
                literals: b"abcde",
                offset: 5,
                match_len: 10,
            }
        );
    });
    assert!(emitted);
    assert_eq!(generator.offset_hist, [5, 1, 4]);
}
11863
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// With zero pending literals at index 10 and `offset_hist = [99, 10, 4]`,
/// `repcode_candidate` must return `Some((10, 10))`.
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep1_before_hash_lookup() {
    let mut generator = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    generator.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // Place the cursor at the start of the repeated half, no literals pending.
    generator.offset_hist = [99, 10, 4];
    generator.last_idx_in_sequence = 10;
    generator.suffix_idx = 10;

    let remaining = &generator.window.last().unwrap().data[10..];
    assert_eq!(generator.repcode_candidate(remaining, 0), Some((10, 10)));
}
11881
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// When the same 10-byte block is added twice, with the first one skipped,
/// a repcode offset of 10 evaluated at the start of the second block must
/// resolve to `Some((10, 10))`.
#[test]
fn simple_matcher_repcode_can_target_previous_window_entry() {
    let mut generator = MatchGenerator::new(64);

    // First copy: added, then skipped without matching.
    generator.add_data(
        b"abcdefghij".to_vec(),
        SuffixStore::with_capacity(64),
        |_, _| {},
    );
    generator.skip_matching();

    // Second, identical copy becomes the current window entry.
    generator.add_data(
        b"abcdefghij".to_vec(),
        SuffixStore::with_capacity(64),
        |_, _| {},
    );

    generator.offset_hist = [99, 10, 4];

    let current_block = &generator.window.last().unwrap().data;
    assert_eq!(
        generator.repcode_candidate(current_block, 0),
        Some((10, 10))
    );
}
11903
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// With zero pending literals at index 10 and `offset_hist = [99, 4, 10]`,
/// `repcode_candidate` must return `Some((10, 10))`.
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep2() {
    let mut generator = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    generator.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // At idx 10, rep1=4 is not a match while rep2=10 is.
    generator.offset_hist = [99, 4, 10];
    generator.last_idx_in_sequence = 10;
    generator.suffix_idx = 10;

    let remaining = &generator.window.last().unwrap().data[10..];
    assert_eq!(generator.repcode_candidate(remaining, 0), Some((10, 10)));
}
11921
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// With zero pending literals at index 10 and `offset_hist = [11, 4, 99]`,
/// `repcode_candidate` must return `Some((10, 10))`.
#[test]
fn simple_matcher_zero_literal_repcode_checks_rep0_minus1() {
    let mut generator = MatchGenerator::new(64);
    let input = b"abcdefghijabcdefghij".to_vec();
    generator.add_data(input, SuffixStore::with_capacity(64), |_, _| {});

    // Neither rep1=4 nor rep2=99 matches; rep0-1 == 10 is the one that does.
    generator.offset_hist = [11, 4, 99];
    generator.last_idx_in_sequence = 10;
    generator.suffix_idx = 10;

    let remaining = &generator.window.last().unwrap().data[10..];
    assert_eq!(generator.repcode_candidate(remaining, 0), Some((10, 10)));
}
11939
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// With a skipped 10-byte block followed by a second 10-byte block and the
/// cursor at index 3, `offset_match_len(14, ..)` must return `None`.
#[test]
fn simple_matcher_repcode_rejects_offsets_beyond_searchable_prefix() {
    let mut generator = MatchGenerator::new(64);

    // First block: added, then skipped without matching.
    generator.add_data(
        b"abcdefghij".to_vec(),
        SuffixStore::with_capacity(64),
        |_, _| {},
    );
    generator.skip_matching();

    // Second block with entirely different content.
    generator.add_data(
        b"klmnopqrst".to_vec(),
        SuffixStore::with_capacity(64),
        |_, _| {},
    );
    generator.suffix_idx = 3;

    let remaining = &generator.window.last().unwrap().data[3..];
    assert_eq!(generator.offset_match_len(14, remaining), None);
}
11960
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// With `hash_fill_step` set to `FAST_HASH_FILL_STEP`, a skipped block must
/// still let a following `bcdef` block match at offset 15 for 5 bytes, with
/// no further sequence after that.
#[test]
fn simple_matcher_skip_matching_seeds_every_position_even_with_fast_step() {
    let mut generator = MatchGenerator::new(64);
    generator.hash_fill_step = FAST_HASH_FILL_STEP;

    // First block is skipped rather than matched.
    let skipped_block = b"abcdefghijklmnop".to_vec();
    generator.add_data(skipped_block, SuffixStore::with_capacity(64), |_, _| {});
    generator.skip_matching();

    // Second block repeats bytes 1..6 of the skipped block.
    generator.add_data(b"bcdef".to_vec(), SuffixStore::with_capacity(64), |_, _| {});

    let emitted = generator.next_sequence(|seq| {
        assert_eq!(
            seq,
            Sequence::Triple {
                literals: b"",
                offset: 15,
                match_len: 5,
            }
        );
    });
    assert!(emitted);

    let emitted_again = generator.next_sequence(|_| {});
    assert!(!emitted_again);
}
11986
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
/// After `skip_matching_with_hint(Some(true))`, a probe taken from offset 3
/// of the skipped block must start with literals, while a probe taken from
/// the block's last `MIN_MATCH_LEN` bytes must match immediately.
#[test]
fn simple_matcher_skip_matching_with_incompressible_hint_uses_sparse_prefix() {
    let block = b"abcdefghijklmnopqrstuvwxyz012345";
    let sparse_probe = block[3..3 + MIN_MATCH_LEN].to_vec();
    let tail_at = block.len() - MIN_MATCH_LEN;
    let tail_probe = block[tail_at..tail_at + MIN_MATCH_LEN].to_vec();

    // Scenario 1: the probe from the sparse prefix must not match right away.
    let mut generator = MatchGenerator::new(128);
    generator.add_data(block.to_vec(), SuffixStore::with_capacity(256), |_, _| {});
    generator.skip_matching_with_hint(Some(true));
    generator.add_data(sparse_probe, SuffixStore::with_capacity(256), |_, _| {});

    let mut sparse_first_is_literals = None;
    let sparse_emitted = generator.next_sequence(|seq| {
        // Keep the verdict for the first sequence only.
        sparse_first_is_literals =
            sparse_first_is_literals.or(Some(matches!(seq, Sequence::Literals { .. })));
    });
    assert!(sparse_emitted);
    assert!(
        sparse_first_is_literals.unwrap_or(false),
        "sparse-start probe should not produce an immediate match"
    );

    // Scenario 2: on a fresh matcher, the tail probe must match immediately,
    // which is what cross-block boundary matching relies on.
    let mut generator = MatchGenerator::new(128);
    generator.add_data(block.to_vec(), SuffixStore::with_capacity(256), |_, _| {});
    generator.skip_matching_with_hint(Some(true));
    generator.add_data(tail_probe, SuffixStore::with_capacity(256), |_, _| {});

    let mut tail_first_is_immediate_match = None;
    let tail_emitted = generator.next_sequence(|seq| {
        // Keep the verdict for the first sequence only.
        tail_first_is_immediate_match = tail_first_is_immediate_match.or(Some(
            matches!(seq, Sequence::Triple { literals, .. } if literals.is_empty()),
        ));
    });
    assert!(tail_emitted);
    assert!(
        tail_first_is_immediate_match.unwrap_or(false),
        "dense tail probe should match immediately at block start"
    );
}
12033
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_backfills_last_searchable_anchor() {
    let mut generator = MatchGenerator::new(64);
    generator.hash_fill_step = FAST_HASH_FILL_STEP;
    let block = b"01234abcde".to_vec();
    generator.add_data(block, SuffixStore::with_capacity(64), |_, _| {});

    // Fill suffixes up to the end of the 10-byte block using the fast step.
    generator.add_suffixes_till(10, FAST_HASH_FILL_STEP);

    // The final five bytes (`data[5..10]`) must be registered at index 5.
    let newest = generator.window.last().unwrap();
    let anchor_key = &newest.data[5..10];
    let registered_at = newest.suffixes.get(anchor_key);
    assert_eq!(registered_at, Some(5));
}
12050
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_skips_when_idx_below_min_match_len() {
    let mut generator = MatchGenerator::new(128);
    generator.hash_fill_step = FAST_HASH_FILL_STEP;
    let block = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    generator.add_data(block, SuffixStore::with_capacity(1 << 16), |_, _| {});

    // Ask for a fill that stops one short of `MIN_MATCH_LEN`.
    generator.add_suffixes_till(MIN_MATCH_LEN - 1, FAST_HASH_FILL_STEP);

    // The key made of the first `MIN_MATCH_LEN` bytes must stay unregistered.
    let newest = generator.window.last().unwrap();
    let leading_key = &newest.data[..MIN_MATCH_LEN];
    let registered_at = newest.suffixes.get(leading_key);
    assert_eq!(registered_at, None);
}
12068
#[cfg(any())] // disabled: tested legacy MatchGenerator/SuffixStore behavior removed in phase 1b
#[test]
fn simple_matcher_add_suffixes_till_fast_step_registers_interleaved_positions() {
    let mut generator = MatchGenerator::new(128);
    generator.hash_fill_step = FAST_HASH_FILL_STEP;
    let block = b"abcdefghijklmnopqrstuvwxyz".to_vec();
    generator.add_data(block, SuffixStore::with_capacity(1 << 16), |_, _| {});

    generator.add_suffixes_till(17, FAST_HASH_FILL_STEP);

    // Positions 0, 3, 6, 9 and 12 are each expected to be registered under
    // the key formed by the `MIN_MATCH_LEN` bytes starting there.
    let newest = generator.window.last().unwrap();
    for pos in (0usize..=12).step_by(3) {
        let registered_at = newest.suffixes.get(&newest.data[pos..pos + MIN_MATCH_LEN]);
        assert_eq!(
            registered_at,
            Some(pos),
            "expected interleaved suffix registration at pos {pos}"
        );
    }
}
12092
#[test]
fn dfast_skip_matching_handles_window_eviction() {
    const REPEATED: [u8; 6] = [7, 8, 9, 10, 11, 12];

    // Three 6-byte blocks are pushed through a matcher created with a
    // 16-byte window; the first two are skipped, the third one is matched.
    let mut matcher = DfastMatchGenerator::new(16);
    matcher.add_data(alloc::vec![1, 2, 3, 4, 5, 6], |_| {});
    matcher.skip_matching(None);
    matcher.add_data(REPEATED.to_vec(), |_| {});
    matcher.skip_matching(None);
    matcher.add_data(REPEATED.to_vec(), |_| {});

    // Replay the emitted sequences on top of the second block's bytes.
    let mut output = REPEATED.to_vec();
    matcher.start_matching(|seq| {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, Some((offset, match_len))),
        };
        output.extend_from_slice(literals);
        if let Some((offset, match_len)) = back_reference {
            // Copy byte by byte: the source range may overlap the bytes
            // being appended.
            let from = output.len() - offset;
            for src in from..from + match_len {
                let byte = output[src];
                output.push(byte);
            }
        }
    });

    assert_eq!(output, [7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12]);
}
12122
#[test]
fn dfast_add_data_callback_reports_evicted_len_not_capacity() {
    /// Builds a buffer whose capacity (64) is larger than its length, so a
    /// length/capacity mix-up in the callback becomes observable.
    fn oversized(bytes: &[u8]) -> Vec<u8> {
        let mut buffer = Vec::with_capacity(64);
        buffer.extend_from_slice(bytes);
        buffer
    }

    let mut matcher = DfastMatchGenerator::new(8);
    matcher.add_data(oversized(b"abcdefgh"), |_| {});

    // Record the length of whatever the callback is handed for the second
    // block.
    let mut observed_evicted_len = None;
    matcher.add_data(oversized(b"ijklmnop"), |data| {
        observed_evicted_len = Some(data.len());
    });

    assert_eq!(
        observed_evicted_len,
        Some(8),
        "eviction callback must report evicted byte length, not backing capacity"
    );
}
12145
/// Driver-path regression for the `commit_space` Dfast-branch eviction
/// accounting bug (CodeRabbit Critical on PR #146).
///
/// Old code counted the INPUT buffer length as `evicted_bytes` because
/// Dfast's `add_data` callback receives the input `Vec<u8>` for pool
/// recycling (Dfast stores bytes in `history`, not per-block Vecs). The fix
/// derives eviction from the `window_size` delta + input length:
/// `evicted = pre + space_len - post`.
///
/// The test exercises `MatchGeneratorDriver::commit_space` directly (not
/// just `DfastMatchGenerator::add_data`) so the assertions catch a future
/// regression that swaps the Dfast branch in `commit_space` back to
/// `evicted_bytes += data.len()` — an older draft of this regression
/// hand-recomputed the formula on the matcher and would pass either way.
///
/// Fixture: `max_window_size = 10`, commit sequence `[4, 4, 5]`. This is
/// the divergent case where the popped block (4 bytes) and the new input
/// (5 bytes) have different sizes; on a saturated-window 1:1 path the two
/// coincide and the bug stays hidden.
///
///   * after commit `"abcd"` (4 B): window_blocks=[4], ws=4
///   * after commit `"efgh"` (4 B): window_blocks=[4,4], ws=8
///   * commit `"ijklm"` (5 B): 8+5>10 → pop front [4] (evict=4),
///     push 5 → window_blocks=[4,5], ws=9
///
/// `commit_space` then calls `retire_dictionary_budget(evicted)`, and the
/// downstream `trim_after_budget_retire` cascade (which fires whenever
/// `retire_dictionary_budget` returns true) drives the budget further down
/// by trimming the now-oversize window. The value observed after the third
/// commit is therefore the END of that cascade, not the budget right after
/// the first retire.
///
/// Tracing the fix path end-to-end with starting budget = 100:
///   1st commit: evicted=0, no retire.
///   2nd commit: evicted=0, no retire.
///   3rd commit: evicted=4. retire(4) → budget=96, max_window=6.
///     trim_after_budget_retire:
///       iter1: ws=9 > max=6, pop [4] → ws=5, evicted=4.
///              retire(4) → budget=92, max_window=2.
///       iter2: ws=5 > max=2, pop [5] → ws=0, evicted=5.
///              retire(5) → budget=87, max_window=0.
///       iter3: ws=0, no trim, retire(0) → false, exit.
///   Final budget = 87. Final max_window_size = 0.
///
/// With the old `data.len()` accounting the reported eviction no longer
/// matches the bytes actually dropped from the window, so the budget and
/// `max_window_size` follow a different trace and the regression surfaces
/// as a failure of the assertions below.
#[test]
fn dfast_commit_space_eviction_uses_window_size_delta() {
    use crate::encoding::CompressionLevel;

    /// Builds a commit buffer whose capacity (64) exceeds its length.
    fn oversized_space(bytes: &[u8]) -> Vec<u8> {
        let mut space = Vec::with_capacity(64);
        space.extend_from_slice(bytes);
        space
    }

    let mut driver = MatchGeneratorDriver::new(10, 1);
    driver.reset(CompressionLevel::Level(3));
    assert!(matches!(driver.storage, MatcherStorage::Dfast(_)));

    // Override the level-derived window with a tiny one so the
    // 4 + 4 + 5 = 13 commit sequence below actually crosses the
    // boundary. A 16 KiB+ default window would never evict on this
    // little data and the bug would stay invisible.
    driver.dfast_matcher_mut().max_window_size = 10;
    driver.dictionary_retained_budget = 100;

    driver.commit_space(oversized_space(b"abcd"));
    assert_eq!(
        driver.dictionary_retained_budget, 100,
        "1st commit fills window 0 → 4, no eviction, no retire"
    );

    driver.commit_space(oversized_space(b"efgh"));
    assert_eq!(
        driver.dictionary_retained_budget, 100,
        "2nd commit fills window 4 → 8, no eviction, no retire"
    );

    driver.commit_space(oversized_space(b"ijklm"));
    assert_eq!(
        driver.dictionary_retained_budget, 87,
        "3rd commit + trim_after_budget_retire cascade. With the fix \
         (evicted=4 from window_size delta) the cascade reclaims 100 \
         → 96 → 92 → 87. With the bug (evicted=5 from data.len()) the \
         3rd commit would panic on `data.len() <= max_window_size` \
         after the 2nd commit's cascade had already shrunk \
         max_window_size to 0."
    );
    assert_eq!(
        driver.dfast_matcher_mut().max_window_size,
        0,
        "cascade drains max_window_size to 0 once budget reclaim \
         exceeds the initial window size"
    );
}
12259
#[test]
fn dfast_trim_to_window_evicts_oldest_block_by_length() {
    /// Builds a buffer whose capacity (64) exceeds its length.
    fn oversized(bytes: &[u8]) -> Vec<u8> {
        let mut buffer = Vec::with_capacity(64);
        buffer.extend_from_slice(bytes);
        buffer
    }

    // Since the history-only storage refactor (#111 Phase 7c step 3) Dfast
    // keeps no input `Vec<u8>`s: the contiguous `history` buffer is the sole
    // byte store and `add_data` hands the input Vec back to the caller's
    // pool eagerly. `trim_to_window` therefore has no Vec to return, and the
    // eviction is observed through `window_size` shrinking by the per-block
    // length recorded in `window_blocks`.
    let mut matcher = DfastMatchGenerator::new(16);
    matcher.add_data(oversized(b"abcdefgh"), |_| {});
    matcher.add_data(oversized(b"ijklmnop"), |_| {});

    // Two 8-byte blocks exactly fill the 16-byte window.
    assert_eq!(matcher.window_size, 16);
    assert_eq!(matcher.window_blocks.len(), 2);

    // Halve the limit, then trim down to it.
    matcher.max_window_size = 8;
    matcher.trim_to_window();

    // There is no callback to assert on: the Dfast `trim_to_window` takes
    // none (HC/Row accept `impl FnMut(Vec<u8>)`). That signature is what
    // locks in the contract — nothing can start invoking a callback without
    // a compile-time signature break that the dispatcher and this test
    // would force the author to address.
    assert_eq!(
        matcher.window_size, 8,
        "exactly one 8-byte block must remain"
    );
    assert_eq!(matcher.window_blocks.len(), 1);
    assert_eq!(matcher.history_abs_start, 8);
}
12301
#[test]
fn dfast_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);

    // Block 1 must come out as literals only; they become the replay history.
    matcher.add_data(b"012345bcdea".to_vec(), |_| {});
    let mut history = Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            history.extend_from_slice(literals);
        } else {
            unreachable!("first block should not match history");
        }
    });
    assert_eq!(history, b"012345bcdea");

    // Block 2 must be covered by exactly one literal-free match that reaches
    // back 5 bytes, into the tail of block 1.
    matcher.add_data(b"bcdeabcdeab".to_vec(), |_| {});
    let mut saw_first_sequence = false;
    matcher.start_matching(|seq| {
        assert!(!saw_first_sequence, "expected a single cross-block match");
        saw_first_sequence = true;
        let (literals, offset, match_len) = match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => (literals, offset, match_len),
            Sequence::Literals { .. } => {
                panic!("expected tail-anchored cross-block match before any literals")
            }
        };
        assert_eq!(literals, b"");
        assert_eq!(offset, 5);
        assert_eq!(match_len, 11);
        // Byte-wise replay: the 11-byte match overlaps its 5-byte offset.
        let from = history.len() - offset;
        for src in from..from + match_len {
            let byte = history[src];
            history.push(byte);
        }
    });

    assert!(
        saw_first_sequence,
        "expected tail-anchored cross-block match"
    );
    assert_eq!(history, b"012345bcdeabcdeabcdeab");
}
12346
/// Regression for #49: pins `MatchTable::backfill_boundary_positions` on the
/// [`HcMatchGenerator`] lazy path.
///
/// `backfill_boundary_positions` seeds ONLY the last `< 4` bytes of the
/// previous slice (positions in `[current_abs_start - 3, current_abs_start)`),
/// i.e. the bytes `insert_position` could not hash earlier because hashing
/// needs 4 bytes of lookahead. The existing 8 MiB window roundtrip test
/// covers cross-slice behaviour end-to-end but does not isolate the backfill
/// of those final 1-3 unhashable bytes.
///
/// The fixture forces the cross-block candidate into
/// `[block_1_end - 3, block_1_end)`:
///
/// - Block 1 = `b"PQRSTBCD"` (8 bytes). Its `start_matching` hashes
///   positions 0..=4 (each has 4 bytes of forward context); positions 5/6/7
///   form the unhashable tail.
/// - Block 2 = `b"BCDBCDBCDB"` (10 bytes). At absolute position 8 (start of
///   block 2) the 4-byte window is `b"BCDB"`. The ONLY earlier insertion of
///   `b"BCDB"` into the hash + chain tables is position 5, made by
///   `backfill_boundary_positions` on next-slice entry (the window at
///   position 5 is `data[5..9] = b"BCD" + block_2[0] = b"BCDB"`).
///
/// If the backfill regresses, position 5 is never hashed, the lookup at
/// position 8 misses, and the lazy parser falls through to a leading
/// literals run — `offset == 3, match_len >= 4` would no longer hold.
#[test]
fn hashchain_inserts_tail_positions_for_next_block_matching() {
    let mut matcher = HcMatchGenerator::new(1 << 22);
    matcher.configure(HC_CONFIG, super::strategy::StrategyTag::Lazy, 22);

    // Block 1: literals only.
    matcher.table.add_data(b"PQRSTBCD".to_vec(), |_| {});
    let mut replayed: Vec<u8> = Vec::new();
    matcher.start_matching(|seq| {
        if let Sequence::Literals { literals } = seq {
            replayed.extend_from_slice(literals);
        } else {
            unreachable!("first block has no internal repeats");
        }
    });
    assert_eq!(replayed, b"PQRSTBCD");

    // Block 2: only the first emitted sequence is inspected; it is stored as
    // `(offset, match_len)`.
    matcher.table.add_data(b"BCDBCDBCDB".to_vec(), |_| {});
    let mut boundary_match: Option<(usize, usize)> = None;
    matcher.start_matching(|seq| {
        if boundary_match.is_some() {
            return;
        }
        match seq {
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                assert_eq!(literals, b"", "no leading literals on the boundary match");
                boundary_match = Some((offset, match_len));
            }
            Sequence::Literals { .. } => {
                panic!(
                    "expected tail-anchored cross-block match before any literals — \
                     backfill_boundary_positions did not seed positions 5/6/7"
                )
            }
        }
    });

    let (offset, match_len) = boundary_match.expect(
        "expected tail-anchored cross-block match emitted from backfill_boundary_positions",
    );
    assert!(
        matches!(offset, 1..=3),
        "boundary match offset {offset} must point into the unhashable tail \
         (positions 5/6/7 of an 8-byte block 1) so the test specifically \
         locks down backfill_boundary_positions",
    );
    assert_eq!(
        offset, 3,
        "candidate position must land at 5 (= block_1_len - 3) so the 4-byte \
         window `data[5..9] = b\"BCDB\"` matches block 2's first hash lookup",
    );
    assert!(
        match_len >= HC_MIN_MATCH_LEN,
        "match_len {match_len} must clear the HC min-match floor",
    );
}
12432
/// After a skip with the `Some(false)` hint, a block that begins with the
/// previous block's tail bytes must start with a literal-free match whose
/// offset equals the tail length.
#[test]
fn dfast_dense_skip_matching_backfills_previous_tail_for_next_block() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";

    // First block: filler bytes followed by `tail`. The buffer is not used
    // again afterwards, so it is moved into the matcher instead of cloned.
    let mut first = b"0123456789abcdef".to_vec();
    first.extend_from_slice(tail);
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(false));

    // Second block: `tail` again, then bytes that did not occur before.
    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // Capture only the first emitted sequence as
    // `(literal_len, offset, match_len)`; a pure literal run records 0/0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected dense skip to preserve cross-boundary tail match"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12476
/// After a skip with the `Some(true)` hint over a 4096-byte block, a block
/// that begins with the previous block's tail bytes must still start with a
/// literal-free match whose offset equals the tail length.
#[test]
fn dfast_sparse_skip_matching_preserves_tail_cross_block_match() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let tail = b"Qz9kLm2Rp";

    // First block: generated bytes with the final `tail.len()` bytes
    // overwritten by `tail`. The buffer is not used again afterwards, so it
    // is moved into the matcher instead of cloning 4 KiB.
    let mut first = deterministic_high_entropy_bytes(0x9E37_79B9_7F4A_7C15, 4096);
    let tail_start = first.len() - tail.len();
    first[tail_start..].copy_from_slice(tail);
    matcher.add_data(first, |_| {});

    matcher.skip_matching(Some(true));

    // Second block: `tail` again, then bytes that did not occur before.
    let mut second = tail.to_vec();
    second.extend_from_slice(b"after-tail-literals");
    matcher.add_data(second, |_| {});

    // Capture only the first emitted sequence as
    // `(literal_len, offset, match_len)`; a pure literal run records 0/0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate cross-block match at block start"
    );
    assert_eq!(
        offset,
        tail.len(),
        "expected match against densely seeded tail"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12522
#[test]
fn dfast_skip_matching_dense_backfills_newly_hashable_long_tail_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    let first_block = deterministic_high_entropy_bytes(0x7A64_0315_D4E1_91C3, 4096);
    let first_len = first_block.len();
    matcher.add_data(first_block, |_| {});
    matcher.skip_matching_dense();

    // One extra byte gives the start at `first_len - 7` a full 8 bytes of
    // history (checked by the fixture assertion below), so the second dense
    // skip is expected to seed it in the long-hash table.
    matcher.add_data(alloc::vec![0xAB], |_| {});
    matcher.skip_matching_dense();

    let boundary_abs = first_len - 7;
    let boundary_rel = boundary_abs - matcher.history_abs_start;
    let history = matcher.live_history();
    assert!(
        boundary_rel + 8 <= history.len(),
        "fixture must make the boundary start long-hashable"
    );
    let bucket = matcher.long_hash_index(&history[boundary_rel..]);
    let expected_slot = matcher.pack_slot(boundary_abs);

    // Single-slot tables (upstream zstd parity): a bucket holds at most one
    // u32, so the check is a direct equality rather than a `.contains`.
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );
    let stored_slot = matcher.tables[bucket];
    assert_eq!(
        stored_slot, expected_slot,
        "dense skip must seed long-hash entry for newly hashable boundary start"
    );
}
12556
#[test]
fn dfast_seed_remaining_hashable_starts_seeds_last_short_hash_positions() {
    let mut matcher = DfastMatchGenerator::new(1 << 20);
    matcher.add_data(
        deterministic_high_entropy_bytes(0x13F0_9A6D_55CE_7B21, 64),
        |_| {},
    );
    matcher.ensure_hash_tables();

    // Locate the newest block inside the live window and start seeding
    // `DFAST_MIN_MATCH_LEN` bytes before its end.
    let block_len = matcher.window_blocks.back().map_or(0, |&len| len);
    let block_abs_start = matcher.history_abs_start + matcher.window_size - block_len;
    let seed_from = block_len - DFAST_MIN_MATCH_LEN;
    matcher.seed_remaining_hashable_starts(block_abs_start, block_len, seed_from);

    // The start five bytes before the block end is the one under test.
    let probe_abs = block_abs_start + block_len - 5;
    let probe_rel = probe_abs - matcher.history_abs_start;
    let history = matcher.live_history();
    assert!(
        probe_rel + 5 <= history.len(),
        "fixture must leave the last short-hash start valid"
    );
    let bucket = matcher.short_hash_index(&history[probe_rel..]);
    let expected_slot = matcher.pack_slot(probe_abs);
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );

    // The short-hash bucket is addressed at `long_len() + short_hash`.
    let short_table_base = matcher.long_len();
    assert_eq!(
        matcher.tables[short_table_base + bucket],
        expected_slot,
        "tail seeding must include the last 5-byte-hashable start"
    );
}
12588
#[test]
fn dfast_seed_remaining_hashable_starts_handles_pos_at_block_end() {
    let mut matcher = DfastMatchGenerator::new(1 << 20);
    matcher.add_data(
        deterministic_high_entropy_bytes(0x7BB2_DA91_441E_C0EF, 64),
        |_| {},
    );
    matcher.ensure_hash_tables();

    // Same setup as the sibling test, but the seeding position passed in is
    // the block end itself.
    let block_len = matcher.window_blocks.back().map_or(0, |&len| len);
    let block_abs_start = matcher.history_abs_start + matcher.window_size - block_len;
    matcher.seed_remaining_hashable_starts(block_abs_start, block_len, block_len);

    // The start five bytes before the block end must still be seeded.
    let probe_abs = block_abs_start + block_len - 5;
    let probe_rel = probe_abs - matcher.history_abs_start;
    let history = matcher.live_history();
    assert!(
        probe_rel + 5 <= history.len(),
        "fixture must leave the last short-hash start valid"
    );
    let bucket = matcher.short_hash_index(&history[probe_rel..]);
    let expected_slot = matcher.pack_slot(probe_abs);
    assert_ne!(
        expected_slot, DFAST_EMPTY_SLOT,
        "pack_slot must never return the empty-slot sentinel for a real position"
    );

    // The short-hash bucket is addressed at `long_len() + short_hash`.
    let short_table_base = matcher.long_len();
    assert_eq!(
        matcher.tables[short_table_base + bucket],
        expected_slot,
        "tail seeding must still include the last 5-byte-hashable start when pos is at block end"
    );
}
12619
/// `ensure_room_for` must call `reduce()` once the requested absolute
/// position would put a relative offset beyond
/// `u32::MAX - DFAST_REBASE_GUARD_BAND`. After that rebase, an entry stored
/// at a much smaller absolute position falls below `reducer` and is cleared
/// to `DFAST_EMPTY_SLOT`, while a fresh insert at the boundary position must
/// `pack_slot` to a valid non-sentinel value that unpacks back to the same
/// absolute position. Mirrors `LdmHashTable::ensure_room_for_*` from
/// PR #139.
///
/// Runs on every target: `trigger_abs = u32::MAX -
/// DFAST_REBASE_GUARD_BAND + 1 = 0xC0000000` fits in `usize` on i686
/// (`usize::MAX = u32::MAX`) without overflow, so the packed-slot boundary
/// path and the u32 ↔ usize round-trip are exercised on every pointer width
/// we ship.
#[test]
fn dfast_ensure_room_for_rebases_above_guard_band() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    matcher.set_hash_bits(10, 10);
    matcher.ensure_hash_tables();

    // Plant one early entry in BOTH tables. `ensure_room_for` / `reduce` is
    // a shared contract for `short_hash` and `long_hash`; seeding just one
    // would let a regression that only cleared short_hash slip through.
    // Writing the packed slot straight into the buckets keeps the test on
    // the rebase mechanics and avoids the full `insert_position` flow with
    // its history/window setup.
    let planted_abs = 1024usize;
    let planted_slot = matcher.pack_slot(planted_abs);
    assert_ne!(planted_slot, DFAST_EMPTY_SLOT);
    let short_table_base = matcher.long_len();
    matcher.tables[short_table_base] = planted_slot;
    matcher.tables[0] = planted_slot;

    // With `position_base = 0`, the smallest `abs_pos` failing the
    // `rel <= max_rel` test is `u32::MAX - DFAST_REBASE_GUARD_BAND + 1`.
    // One `reduce(DFAST_REBASE_GUARD_BAND)` then moves the base forward by
    // `DFAST_REBASE_GUARD_BAND`.
    let guard_band = DFAST_REBASE_GUARD_BAND as usize;
    let trigger_abs = (u32::MAX as usize) - guard_band + 1;
    assert_eq!(matcher.position_base, 0);
    matcher.ensure_room_for(trigger_abs);
    assert_eq!(
        matcher.position_base, DFAST_REBASE_GUARD_BAND as usize,
        "rebase must advance position_base by DFAST_REBASE_GUARD_BAND"
    );

    // The entry at abs=1024 was packed as slot 1025; the rebase subtracts
    // `DFAST_REBASE_GUARD_BAND` (= 2^30) from every slot. Since
    // 1025 <= 2^30 the slot collapses to the empty sentinel — upstream zstd
    // parity for `ZSTD_window_reduce`'s clamp-at-zero rule. BOTH tables are
    // checked because `reduce()` walks them in sequence.
    let short_entry = matcher.tables[matcher.long_len()];
    assert_eq!(
        short_entry,
        DFAST_EMPTY_SLOT,
        "pre-rebase short-hash entries below the reducer must become empty"
    );
    let long_entry = matcher.tables[0];
    assert_eq!(
        long_entry, DFAST_EMPTY_SLOT,
        "pre-rebase long-hash entries below the reducer must become empty"
    );

    // A fresh insert past the rebase boundary must round-trip: it packs to
    // a non-sentinel value and unpacks to the same absolute position via
    // `position_base + slot - 1`.
    let fresh_slot = matcher.pack_slot(trigger_abs);
    assert_ne!(fresh_slot, DFAST_EMPTY_SLOT);
    let round_tripped = matcher.position_base + (fresh_slot as usize) - 1;
    assert_eq!(
        round_tripped, trigger_abs,
        "post-rebase pack/unpack must round-trip the absolute position"
    );
}
12694
/// Two consecutive blocks are skipped with the `Some(true)` hint; a third
/// block that begins with the bytes straddling the first→second boundary
/// must start with a literal-free match pointing back at that boundary.
#[test]
fn dfast_sparse_skip_matching_backfills_previous_tail_for_consecutive_sparse_blocks() {
    let mut matcher = DfastMatchGenerator::new(1 << 22);
    // The marker sequence is split across the block boundary: the prefix
    // ends block 1, the suffix opens block 2.
    let boundary_prefix = [0xFA, 0xFB, 0xFC];
    let boundary_suffix = [0xFD, 0xEE, 0xAD, 0xBE, 0xEF, 0x11, 0x22, 0x33];

    let mut first = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    let first_tail_start = first.len() - boundary_prefix.len();
    first[first_tail_start..].copy_from_slice(&boundary_prefix);
    matcher.add_data(first, |_| {});
    matcher.skip_matching(Some(true));

    let mut second = deterministic_high_entropy_bytes(0xA5A5_5A5A_C3C3_3C3C, 4096);
    second[..boundary_suffix.len()].copy_from_slice(&boundary_suffix);
    // Only the length is needed later, so record it and move the buffer
    // into the matcher instead of cloning 4 KiB.
    let second_len = second.len();
    matcher.add_data(second, |_| {});
    matcher.skip_matching(Some(true));

    // Third block: prefix + suffix back to back, then fresh bytes.
    let mut third = boundary_prefix.to_vec();
    third.extend_from_slice(&boundary_suffix);
    third.extend_from_slice(b"-trailing-literals");
    matcher.add_data(third, |_| {});

    // Capture only the first emitted sequence as
    // `(literal_len, offset, match_len)`; a pure literal run records 0/0.
    let mut first_sequence = None;
    matcher.start_matching(|seq| {
        if first_sequence.is_none() {
            first_sequence = Some(match seq {
                Sequence::Literals { literals } => (literals.len(), 0usize, 0usize),
                Sequence::Triple {
                    literals,
                    offset,
                    match_len,
                } => (literals.len(), offset, match_len),
            });
        }
    });

    let (lit_len, offset, match_len) = first_sequence.expect("expected at least one sequence");
    assert_eq!(
        lit_len, 0,
        "expected immediate match from the prior sparse-skip boundary"
    );
    // The match source starts `boundary_prefix.len()` bytes before the end
    // of block 1, i.e. one full second block plus the prefix behind block 3.
    assert_eq!(
        offset,
        second_len + boundary_prefix.len(),
        "expected match against backfilled first→second boundary start"
    );
    assert!(
        match_len >= DFAST_MIN_MATCH_LEN,
        "match length should satisfy dfast minimum match length"
    );
}
12747
#[test]
fn fastest_hint_iteration_23_sequences_reconstruct_source() {
    /// Produces `len` pseudo-random bytes from a 64-bit linear congruential
    /// generator seeded with `seed`; each byte is taken from bit 33 upward
    /// of the updated state.
    fn generate_data(seed: u64, len: usize) -> Vec<u8> {
        let mut state = seed;
        (0..len)
            .map(|_| {
                state = state
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (state >> 33) as u8
            })
            .collect()
    }

    let i = 23u64;
    let len = (i * 89 % 16384) as usize;
    let mut data = generate_data(i, len);
    // Append the same 128-byte slice twice so the input contains an exact
    // repeat the matcher has the opportunity to reference.
    let repeat = data[128..256].to_vec();
    for _ in 0..2 {
        data.extend_from_slice(&repeat);
    }

    // Feed the whole fixture to the driver as a single committed block, with
    // the source-size hint set before the reset.
    let mut driver = MatchGeneratorDriver::new(1024 * 128, 1);
    driver.set_source_size_hint(data.len() as u64);
    driver.reset(CompressionLevel::Fastest);
    let mut space = driver.get_next_space();
    space[..data.len()].copy_from_slice(&data);
    space.truncate(data.len());
    driver.commit_space(space);

    // Replay every emitted sequence the way a decoder would: literals are
    // appended verbatim, matches are copied from the already-rebuilt output.
    let mut rebuilt = Vec::with_capacity(data.len());
    let mut saw_triple = false;
    driver.start_matching(|seq| {
        let (literals, back_reference) = match seq {
            Sequence::Literals { literals } => (literals, None),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                saw_triple = true;
                (literals, Some((offset, match_len)))
            }
        };
        rebuilt.extend_from_slice(literals);

        if let Some((offset, match_len)) = back_reference {
            assert!(offset > 0, "offset must be non-zero");
            assert!(
                offset <= rebuilt.len(),
                "offset must reference already-produced bytes: offset={} produced={}",
                offset,
                rebuilt.len()
            );
            // Byte-at-a-time copy on purpose: a match may overlap the bytes
            // it is producing (match_len > offset).
            let start = rebuilt.len() - offset;
            for src in start..start + match_len {
                let byte = rebuilt[src];
                rebuilt.push(byte);
            }
        }
    });

    // No assertion is made on `saw_triple`. Whether this particular input
    // yields a Triple depends on the matcher's step-skip schedule: the
    // upstream zstd-shape kernel walks ip0 with kSearchStrength-driven
    // stride growth and may step over the repeat region, whereas the legacy
    // SuffixStore-based matcher visited every position and always hit short
    // repeats. Emitting a Triple was a legacy tuning preference, not a
    // correctness invariant; the reconstruction check below is the point
    // of this test.
    let _ = saw_triple;
    assert_eq!(rebuilt, data);
}
12817
#[test]
fn fast_levels_dispatch_per_level_hash_log_and_mls() {
    // Level 1 follows the upstream zstd row
    // `{ 19, 13, 14, 1, 7, 0, ZSTD_fast }`: hash_log=14, mls=7
    // (the row's window_log is 19; it is not asserted here).
    let level_one = resolve_level_params(CompressionLevel::Level(1), None)
        .fast
        .unwrap();
    assert_eq!(level_one.hash_log, 14);
    assert_eq!(level_one.mls, 7);
    assert_eq!(level_one.step_size, 2);

    // Negative levels share upstream zstd's row 0 ("base for negative"):
    // hash_log=13, mls=7. The resulting 32 KiB table is L1d-resident (every
    // probe an L1 hit, versus an L2 access for a 64 KiB hash_log=14 table),
    // and minMatch=7 drops short-distance 6-byte matches — upstream zstd
    // parity on both ratio and throughput.
    // Only step_size varies: upstream sets targetLength = -level and
    // step_size = (-level) + 1, i.e. 2..8 for L-1..L-7.
    for n in -7..=-1 {
        let resolved = resolve_level_params(CompressionLevel::Level(n), None)
            .fast
            .unwrap();
        let want_step = ((-n) as usize) + 1;
        assert_eq!(resolved.hash_log, 13, "Level({n}) fast_hash_log");
        assert_eq!(resolved.mls, 7, "Level({n}) fast_mls");
        assert_eq!(resolved.step_size, want_step, "Level({n}) fast_step_size");
    }

    // Fastest and Uncompressed both keep hash_log=14 / mls=6 / step_size=2
    // (their own tuning; not part of the negative-level upstream zstd
    // ladder). They differ only in window_log.
    let named_levels = [
        // Fastest: window_log=19.
        (CompressionLevel::Fastest, 19),
        // Uncompressed: window_log=17 (no history references, smaller
        // decoder reservation); fast cParams identical to Fastest.
        (CompressionLevel::Uncompressed, 17),
    ];
    for (level, want_window_log) in named_levels {
        let params = resolve_level_params(level, None);
        let fast = params.fast.unwrap();
        assert_eq!(
            (params.window_log, fast.hash_log, fast.mls, fast.step_size),
            (want_window_log, 14, 6, 2),
        );
    }
}
12863
/// End-to-end wiring check: for each Fast-strategy level, reset a
/// `MatchGeneratorDriver` and verify that the inner `FastKernelMatcher`
/// reports exactly the `(hash_log, mls, step_size)` tuple that
/// `resolve_level_params` computed for that level. This catches plumbing
/// bugs that a parameter-only test cannot see: swapped arguments, a
/// step_size left over from a previous frame, or values stuck on their
/// defaults.
#[test]
fn fast_levels_driver_wiring_threads_cparams_into_inner_matcher() {
    let fast_levels = [
        CompressionLevel::Level(1),
        CompressionLevel::Fastest,
        CompressionLevel::Uncompressed,
        CompressionLevel::Level(-1),
        CompressionLevel::Level(-2),
        CompressionLevel::Level(-3),
        CompressionLevel::Level(-4),
        CompressionLevel::Level(-5),
        CompressionLevel::Level(-6),
        CompressionLevel::Level(-7),
    ];
    let mut driver = MatchGeneratorDriver::new(64 * 1024, 1);

    for level in fast_levels {
        let resolved = resolve_level_params(level, None);
        // Guard the fixture itself: a level that does not resolve to the
        // Fast strategy would make the rest of this iteration meaningless.
        assert_eq!(
            resolved.strategy_tag,
            super::strategy::StrategyTag::Fast,
            "{level:?} must resolve to Fast strategy",
        );

        // Detour through a non-Fast strategy so the following reset takes
        // the backend-switch path (`MatchGeneratorDriver::new` /
        // `simple_mut` rebuild the Fast variant via
        // `FastKernelMatcher::with_params`). Without the detour every
        // iteration would stay in `BackendTag::Simple` and only exercise
        // `FastKernelMatcher::reset`, leaving the `with_params` wiring
        // untested on the production path. `Default` resolves to the Dfast
        // strategy (a non-Fast row), which is enough to force the swap.
        crate::encoding::Matcher::reset(&mut driver, CompressionLevel::Default);

        // The reset under test: the same entry point FrameCompressor /
        // StreamingEncoder use in production.
        crate::encoding::Matcher::reset(&mut driver, level);

        let expected = resolved.fast.unwrap();
        let inner = driver.simple_mut();
        assert_eq!(
            inner.hash_log(),
            expected.hash_log,
            "{level:?}: inner matcher hash_log mismatch — argument swap?",
        );
        assert_eq!(
            inner.mls(),
            expected.mls,
            "{level:?}: inner matcher mls mismatch — argument swap?",
        );
        assert_eq!(
            inner.step_size(),
            expected.step_size,
            "{level:?}: inner matcher step_size mismatch — stale value carried from prior reset?",
        );
    }
}
12933
/// Locks the resolved `target_len` for levels 5-15 to the reference
/// `cParams.targetLength` column of `clevels.h` table[0] (the default
/// table, `srcSize > 256 KB`). The reference's lazy outer loop uses
/// `targetLength` as `sufficient_len`: the "nice match" threshold at which
/// the chain walk stops as soon as a candidate reaches that length.
///
/// The reference runs btlazy2 for levels 13-15 while this crate runs the
/// hash-chain Lazy parser, but the reference `targetLength` (32) is the
/// same nice-match threshold for both finders, so it is mirrored as-is.
///
/// The expected column is transcribed inline, keeping this a pure-Rust
/// in-tree test with no FFI dependency.
#[test]
fn lazy_band_target_len_matches_default_table() {
    // (level, targetLength) from table[0] (srcSize > 256 KB), levels
    // 5..=15: the nice-match (`sufficient_len`) threshold of the lazy
    // outer loop.
    let expected: [(i32, usize); 11] = [
        (5, 2),
        (6, 4),
        (7, 8),
        (8, 16),
        (9, 16),
        (10, 16),
        (11, 16),
        (12, 32),
        (13, 32),
        (14, 32),
        (15, 32),
    ];
    for (level, want) in expected {
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        // Prefer the `hc` config (lazy levels, L6-15, HashChain backend) and
        // fall back to `row` (greedy L5, Row backend).
        let target_len = if let Some(hc) = params.hc {
            hc.target_len
        } else {
            params
                .row
                .map(|row| row.target_len)
                .expect("lazy/greedy level carries hc or row config")
        };
        assert_eq!(target_len, want, "L{level}: target_len must match table[0]");
    }
}
12974
/// Checks that levels 13-15 carry the reference btlazy2 budget from
/// `clevels.h` table[0]: `window_log`, `hash_log` and `chain_log` equal the
/// reference `windowLog` / `hashLog` / `chainLog`, and
/// `search_depth == 1 << cParams.searchLog` (16 / 32 / 64).
///
/// These levels run on the hash-chain Lazy parser rather than a
/// binary-tree finder, so they do not re-establish a strict ratio ladder
/// above L12 on window-fitting inputs. Asserting the complete row, not
/// only `search_depth`, keeps the whole budget aligned and protects every
/// field from silent drift.
#[test]
fn upper_lazy_band_params_match_default_table() {
    // Rows of table[0] (srcSize > 256 KB) for levels 13..=15, laid out as
    // (level, windowLog, hashLog, chainLog, 1 << searchLog).
    let reference_rows: [(i32, u8, usize, usize, usize); 3] = [
        (13, 22, 22, 22, 1 << 4),
        (14, 22, 23, 22, 1 << 5),
        (15, 22, 23, 23, 1 << 6),
    ];
    for (level, window_log, hash_log, chain_log, search_depth) in reference_rows {
        let params = resolve_level_params(CompressionLevel::Level(level), None);
        let hc = params.hc.unwrap();
        assert_eq!(hc.search_depth, search_depth, "L{level}: search_depth");
        assert_eq!(params.window_log, window_log, "L{level}: window_log");
        assert_eq!(hc.hash_log, hash_log, "L{level}: hash_log");
        assert_eq!(hc.chain_log, chain_log, "L{level}: chain_log");
    }
}
structured_zstd/encoding/match_generator.rs

structured_zstd/encoding/
match_generator.rs