marque_engine/
engine.rs

1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! `Engine` — the configured, ready-to-run pipeline.
6
7use crate::clock::{Clock, SystemClock};
8use crate::errors::{EngineConstructionError, EngineError};
9use crate::options::{FixOptions, LintOptions};
10use crate::output::{FixResult, LintResult};
11use crate::recognizer::shift_token_spans;
12use crate::scheduler::schedule_rewrites;
13use aho_corasick::AhoCorasick;
14use marque_capco::CapcoScheme;
15use marque_capco::provenance::DecoderProvenance;
16use marque_config::Config;
17use marque_ism::Span;
18use marque_rules::{
19    AppliedFix, CORRECTIONS_MAP_CITATION, Confidence, Diagnostic, EnginePromotionToken,
20    FixProposal, FixSource, RuleId, RuleSet, Severity,
21};
22use marque_scheme::ambiguity::Parsed;
23use marque_scheme::recognizer::{ParseContext, Recognizer};
24use marque_scheme::{MarkingScheme, RewriteId};
25use std::collections::HashMap;
26use std::panic::AssertUnwindSafe;
27use std::sync::Arc;
28// See note in `options.rs` — `web_time::Instant` is `std::time::Instant`
29// on native and a Performance.now() polyfill on wasm32-unknown-unknown.
30use web_time::Instant;
31
/// Reports whether a cooperative-cancellation deadline has passed
/// (spec 005 §R3).
///
/// Intended as the single comparison shared by every deadline check
/// site (`lint_with_options` pre-pass, per-candidate, `fix_inner`
/// post-lint, per-fix-application) so the wall-clock test stays
/// consistent between them.
///
/// Returns `false` when no deadline is set. Otherwise the test is
/// `now >= deadline`, so a deadline exactly equal to the current
/// `Instant` already counts as expired.
#[inline]
fn deadline_expired(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(cutoff) => cutoff <= Instant::now(),
        None => false,
    }
}
42
/// Rule identifier stamped on the decoder-path
/// `FixSource::DecoderPosterior` diagnostics that `Engine::lint` emits.
///
/// Minted in Phase 4 PR-4b so a recognition-layer rewrite carries a
/// real `RuleId` (rules and fixes both require one) while staying
/// outside every CAPCO `E### / W### / C### / S###` namespace. Seeing
/// `R001` on a diagnostic means it came from the decoder, not from a
/// CAPCO rule.
const DECODER_RULE_ID: &str = "R001";
51
/// Citation carried by `R001 decoder-recognition` diagnostics.
///
/// References CAPCO-2016 §A.6, the canonical-marking-form section the
/// decoder enforces. Per Constitution VIII the citation is verifiable:
/// §A.6 is "(U) Formatting", starting on page 15 (table of contents,
/// `crates/capco/docs/CAPCO-2016.md` line 49), and holds the canonical
/// syntax for the portion / banner / CAB markings that the decoder
/// canonicalizes input toward.
const DECODER_CITATION: &str = "CAPCO-2016 §A.6 p15";
60
/// Selects whether a fix pass mutates the source or only simulates it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FixMode {
    /// Fixes are written into the source text.
    Apply,
    /// Dry run: the audit stream is identical to `Apply`, but the source
    /// is left unchanged.
    DryRun,
}
69
/// Rejection of a caller-supplied runtime confidence-threshold override.
///
/// Wraps the offending value, which lies outside the valid `[0.0, 1.0]`
/// range (the rendered message also covers the NaN case).
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct InvalidThreshold(pub f32);

impl std::fmt::Display for InvalidThreshold {
    /// Renders the rejected value together with the accepted range.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(rejected) = self;
        write!(f, "confidence threshold {rejected} is outside [0.0, 1.0] or is NaN")
    }
}

impl std::error::Error for InvalidThreshold {}
86
87/// A configured engine instance.
88pub struct Engine {
89    config: Config,
90    rule_sets: Vec<Box<dyn RuleSet>>,
91    clock: Box<dyn Clock>,
92    /// Corrections map wrapped in Arc once at construction time so that each
93    /// `RuleContext` clone in `lint()` is an O(1) refcount bump, not a
94    /// deep-clone of the entire HashMap.
95    corrections_arc: Option<Arc<HashMap<String, String>>>,
96    /// Pre-built Aho-Corasick automaton for pre-scanner text corrections.
97    /// Built once at construction time from the corrections map (excluding
98    /// no-op and "//" entries). `None` when the corrections map is empty or
99    /// all entries are filtered out.
100    corrections_ac: Option<CachedAhoCorasick>,
101    /// Topologically-sorted rewrite ids, computed once at construction
102    /// time from the scheme's `page_rewrites()` declaration. The order
103    /// satisfies: for every edge `a → b` (rewrite `a` writes a
104    /// category `b` reads), `a` appears before `b`. When dataflow
105    /// edges fully determine the order, FR-007's declaration-order-
106    /// independence guarantee holds; when two rewrites have no edge
107    /// between them, the scheduler breaks the tie by declaration
108    /// order (Kahn's algorithm seeded in declaration order). Empty
109    /// when the scheme declares no rewrites.
110    scheduled_rewrites: Box<[RewriteId]>,
111    /// Recognizer used by `lint()` to resolve each scanner candidate to
112    /// an `IsmAttributes`. Held behind `Arc<dyn Recognizer>` so callers
113    /// can override the default via [`Engine::with_recognizer`] without
114    /// touching the lint loop. Shared across threads unchanged — the
115    /// recognizer trait is `Send + Sync` and `BatchEngine` workers hold
116    /// the same `Arc` reference (Constitution VI, FR-023).
117    ///
118    /// Default: [`StrictOrDecoderRecognizer`] — strict-first dispatch
119    /// with a decoder fallback on strict-parse zero-candidate. The
120    /// decoder recovers mangled markings that are edit-distance-1/2,
121    /// token-reordered, superseded, or case-mangled from a real
122    /// CAPCO-2016 marking. Live-typing surfaces concerned with
123    /// per-keystroke latency are expected to debounce their calls into
124    /// the engine; surfaces that need to pin strict-only behavior (the
125    /// SC-001 interactive-latency benchmark, tests asserting strict
126    /// dispatch) install [`StrictRecognizer`] explicitly via
127    /// [`Engine::with_recognizer`].
128    recognizer: Arc<dyn Recognizer<CapcoScheme>>,
129
130    /// CLI-supplied corpus override (Phase 4 PR-5 / FR-013 / T069).
131    /// Held only behind the `corpus-override` Cargo feature so the
132    /// WASM artifact and the `marque-server` build cannot
133    /// accidentally accept one through any code path.
134    ///
135    /// The decoder does not yet substitute these priors into scoring
136    /// — PR-5 minimal scope wires the surface end-to-end and stamps
137    /// every decoder fix with
138    /// [`marque_rules::FeatureId::CorpusOverrideInEffect`] in the
139    /// audit record so an auditor can identify fixes produced under
140    /// organizational overrides vs. stock priors. The prior-
141    /// substitution wiring is the next-PR step; this field is the
142    /// seam.
143    #[cfg(feature = "corpus-override")]
144    corpus_override: Option<std::sync::Arc<marque_config::corpus_override::CorpusOverride>>,
145}
146
147/// Cached AhoCorasick automaton + the active (key, value) pairs that
148/// correspond to its pattern indices.
149struct CachedAhoCorasick {
150    ac: AhoCorasick,
151    /// Active correction pairs, indexed by `PatternID::as_usize()`.
152    active: Vec<(Box<str>, Box<str>)>,
153}
154
155impl Engine {
156    /// Create a new engine with the given configuration, rule sets, and
157    /// marking scheme.
158    ///
159    /// Runs the page-rewrite scheduler (Kahn's algorithm over the
160    /// scheme's declared `reads` / `writes` axes) once at construction
161    /// time. Cycles and unannotated `Custom` rewrites fail closed with
162    /// [`EngineConstructionError`] rather than degrading at lint time.
163    ///
164    /// Use [`Engine::with_clock`] for deterministic-timestamp testing.
165    pub fn new<S: MarkingScheme>(
166        config: Config,
167        rule_sets: Vec<Box<dyn RuleSet>>,
168        scheme: S,
169    ) -> Result<Self, EngineConstructionError> {
170        Self::with_clock(config, rule_sets, scheme, Box::new(SystemClock))
171    }
172
173    /// Create an engine with a custom clock (for deterministic tests).
174    pub fn with_clock<S: MarkingScheme>(
175        mut config: Config,
176        rule_sets: Vec<Box<dyn RuleSet>>,
177        scheme: S,
178        clock: Box<dyn Clock>,
179    ) -> Result<Self, EngineConstructionError> {
180        // Canonicalize [rules] overrides against the registered rule
181        // set: accept either the rule ID (e.g. "E001") or the rule
182        // name (e.g. "portion-mark-in-banner"), resolve both to the
183        // canonical ID before the engine stores the map, and hard-fail
184        // on any unknown key. See `canonicalize_rule_overrides`.
185        canonicalize_rule_overrides(&mut config, &rule_sets)?;
186
187        let scheduled_rewrites = schedule_rewrites(scheme.page_rewrites())?;
188        // Take ownership of the corrections map instead of cloning —
189        // nothing reads config.corrections after construction.
190        let corrections_arc = if config.corrections.is_empty() {
191            None
192        } else {
193            Some(Arc::new(std::mem::take(&mut config.corrections)))
194        };
195
196        // Pre-build the AhoCorasick automaton for pre-scanner text corrections.
197        // This is O(total pattern bytes) and done once, not per-lint call.
198        let corrections_ac = corrections_arc.as_ref().and_then(|corrections| {
199            // Sort by key for deterministic pattern ordering — HashMap
200            // iteration order is random (hash seed varies per process),
201            // and AhoCorasick pattern IDs depend on insertion order.
202            let mut active: Vec<(Box<str>, Box<str>)> = corrections
203                .iter()
204                .filter(|(k, v)| k != v && k.as_str() != "//")
205                .map(|(k, v)| (k.as_str().into(), v.as_str().into()))
206                .collect();
207            active.sort_by(|(a, _), (b, _)| a.cmp(b));
208            if active.is_empty() {
209                return None;
210            }
211            let patterns: Vec<&str> = active.iter().map(|(k, _)| k.as_ref()).collect();
212            match AhoCorasick::new(&patterns) {
213                Ok(ac) => Some(CachedAhoCorasick { ac, active }),
214                Err(e) => {
215                    tracing::warn!(
216                        "failed to build AhoCorasick automaton for corrections map \
217                         ({} patterns): {e}; pre-scanner text corrections disabled",
218                        patterns.len()
219                    );
220                    None
221                }
222            }
223        });
224
225        Ok(Self {
226            config,
227            rule_sets,
228            clock,
229            corrections_arc,
230            corrections_ac,
231            scheduled_rewrites,
232            recognizer: Arc::new(crate::decoder::StrictOrDecoderRecognizer::new()),
233            #[cfg(feature = "corpus-override")]
234            corpus_override: None,
235        })
236    }
237
238    /// The topologically-sorted rewrite order computed by the scheduler
239    /// at construction time.
240    ///
241    /// Exposed for diagnostic / test inspection. Per-document lint does
242    /// not re-sort; this slice is the canonical order every page roll-up
243    /// walks.
244    pub fn scheduled_rewrites(&self) -> &[RewriteId] {
245        &self.scheduled_rewrites
246    }
247
248    /// Override the engine's recognizer. The default installed by
249    /// [`Engine::new`] is [`StrictOrDecoderRecognizer`] (strict-first,
250    /// decoder fallback). Callers that need to pin a different dispatch
251    /// — most commonly [`StrictRecognizer`] for the SC-001 interactive-
252    /// latency benchmark or tests asserting strict-only behavior —
253    /// install one explicitly here.
254    ///
255    /// Returns the engine by value so callers can chain:
256    ///
257    /// ```ignore
258    /// let engine = Engine::new(config, rules, scheme)?
259    ///     .with_recognizer(Arc::new(StrictRecognizer::new()));
260    /// ```
261    #[must_use = "with_recognizer returns a new Engine; the returned value must be bound for the override to take effect"]
262    pub fn with_recognizer(mut self, recognizer: Arc<dyn Recognizer<CapcoScheme>>) -> Self {
263        self.recognizer = recognizer;
264        self
265    }
266
267    /// Install a CLI-supplied corpus override. Only available when
268    /// the engine is built with the `corpus-override` Cargo feature
269    /// (CLI-only — `marque-server` rejects override input on every
270    /// channel per T066, and the WASM crate cannot enable the feature
271    /// at all per T067).
272    ///
273    /// Phase 4 PR-5 minimal scope: the engine retains the override
274    /// for audit-annotation purposes only. Every subsequent decoder-
275    /// path fix produced by [`Engine::lint`] gets a
276    /// [`FeatureId::CorpusOverrideInEffect`] feature contribution
277    /// appended to its `Confidence.features` so an auditor can
278    /// identify fixes produced under organizational overrides vs.
279    /// stock priors. Substituting the override priors into the
280    /// decoder's prior-table lookup is the next-PR step.
281    #[cfg(feature = "corpus-override")]
282    #[must_use = "with_corpus_override returns a new Engine; the result must be bound to take effect — `engine.with_corpus_override(o)` alone leaves the engine without an override installed"]
283    pub fn with_corpus_override(
284        mut self,
285        override_data: std::sync::Arc<marque_config::corpus_override::CorpusOverride>,
286    ) -> Self {
287        self.corpus_override = Some(override_data);
288        self
289    }
290
291    /// Whether a corpus override is in effect for this engine.
292    ///
293    /// Returns `false` unconditionally when the `corpus-override`
294    /// Cargo feature is not compiled in — the WASM and server
295    /// builds therefore cannot observe a `true` here regardless of
296    /// what any caller passes through other surfaces. Callers that
297    /// need to thread the flag into audit-record construction (the
298    /// private `build_decoder_diagnostic` helper inside this module)
299    /// should go through this method rather than poking at the
300    /// field directly.
301    #[inline]
302    pub fn corpus_override_active(&self) -> bool {
303        #[cfg(feature = "corpus-override")]
304        {
305            self.corpus_override.is_some()
306        }
307        #[cfg(not(feature = "corpus-override"))]
308        {
309            false
310        }
311    }
312
313    /// Lint a UTF-8 text buffer. Returns diagnostics without modifying input.
314    ///
315    /// Back-compat shim over [`Engine::lint_with_options`] — calling
316    /// `lint(src)` is equivalent to
317    /// `lint_with_options(src, &LintOptions::default())`. New code that
318    /// needs a deadline (spec 005 §R3) should call the `_with_options`
319    /// variant directly.
320    pub fn lint(&self, source: &[u8]) -> LintResult {
321        self.lint_with_options(source, &LintOptions::default())
322    }
323
324    /// Lint with per-call options (spec 005 §R2).
325    ///
326    /// Phase 2 honors `opts.deadline` via cooperative cancellation
327    /// (spec §R3): a pre-pass check returns immediately on an
328    /// already-expired deadline, and a per-candidate check inside
329    /// the rule loop breaks out as soon as the deadline passes. The
330    /// returned `LintResult` carries `truncated: bool` together with
331    /// `candidates_processed` / `candidates_total` so the caller can
332    /// distinguish a complete pass from a deadline-bounded partial
333    /// pass.
334    ///
335    /// Granularity: the engine checks the deadline at candidate
336    /// boundaries (between scanner-emitted candidates), not inside
337    /// any individual rule's `check`. A pathologically slow rule
338    /// running on one large candidate can therefore overrun the
339    /// deadline by the time that one rule takes; this is the spec
340    /// §R3 trade-off — a finer-grained check inside `Rule::check`
341    /// would require a deadline-aware rule trait.
342    pub fn lint_with_options(&self, source: &[u8], opts: &LintOptions) -> LintResult {
343        use marque_core::Scanner;
344        use marque_ism::{MarkingType, PageContext};
345        use marque_rules::RuleContext;
346
347        // T007: pre-pass deadline check. An already-expired deadline
348        // returns a fully-truncated empty result before the scanner
349        // runs at all, preserving the spec invariant that the
350        // expired path is observable in zero work.
351        if deadline_expired(opts.deadline) {
352            return LintResult {
353                truncated: true,
354                ..Default::default()
355            };
356        }
357
358        let candidates = Scanner::scan(source);
359        // T009: candidates_total is fixed once the scanner has
360        // produced the candidate stream. It is independent of how
361        // many candidates the rule loop ultimately processes — the
362        // delta against `candidates_processed` is what makes
363        // truncation observable to the caller (R3). On a complete
364        // pass these are equal; on a deadline-bounded pass the
365        // function returns early from inside the loop with the
366        // partial `candidates_processed`, so the post-loop
367        // `LintResult` construction below is reached ONLY on
368        // non-truncated completion.
369        let candidates_total = candidates.len();
370        let mut candidates_processed: usize = 0;
371
372        // corrections_arc was built once at Engine construction; each clone here
373        // is an O(1) refcount bump.
374        let corrections_arc = self.corrections_arc.clone();
375
376        let mut diagnostics = Vec::new();
377        // Build page context by accumulating portion markings in document order.
378        // Banner and CAB rules receive this context so they can validate the
379        // observed banner against the expected composite. Phase 3 wires the
380        // page-break reset below — the scanner emits a `MarkingType::PageBreak`
381        // candidate at every form-feed and at every `\n\n\n+` run; on each
382        // such candidate we drop the accumulator and start a fresh page.
383        let mut page_context = PageContext::new();
384        // Cache the current Arc<PageContext> so that consecutive banner/CAB
385        // candidates on the same page share a single allocation. The cache is
386        // invalidated (set to None) whenever a new portion is accumulated or
387        // a page break resets the context.
388        let mut page_context_arc: Option<Arc<PageContext>> = None;
389
390        // FR-011: per-page strict classification floor. Tracks the
391        // highest classification rank produced by the strict path on
392        // the current page (`marque_ism::Classification as u8`,
393        // Unclassified=0 … TopSecret=4). Threaded into
394        // `ParseContext::classification_floor` so the decoder rejects
395        // any candidate at a strictly-lower level on the same page.
396        // Reset on `MarkingType::PageBreak` per Constitution VI's
397        // "PageContext resets at scanner-emitted page-break candidates"
398        // invariant. Updated *only* by classifications drawn from
399        // strict-path recognitions — decoder-recovered markings do not
400        // raise the floor for themselves (otherwise a misrecognition
401        // would self-justify by raising the floor it then clears).
402        let mut classification_floor: Option<u8> = None;
403
404        for candidate in &candidates {
405            // T008: per-candidate deadline check. Checking at the top
406            // of the loop (before any per-candidate work — including
407            // a page-break reset) guarantees the abort happens
408            // between candidates, never partway through the rule
409            // pipeline. On expiry we return immediately so the
410            // post-loop corrections-map AhoCorasick pass — which is
411            // O(source bytes) — does NOT overrun the deadline.
412            // Returning here also gives the spec-correct
413            // `truncated/processed/total` triple to the caller
414            // without falling through the rest of the function.
415            if deadline_expired(opts.deadline) {
416                return LintResult {
417                    diagnostics,
418                    truncated: true,
419                    candidates_processed,
420                    candidates_total,
421                    ..Default::default()
422                };
423            }
424
425            // T009: count every candidate the engine started
426            // processing past the deadline boundary. The increment
427            // sits ABOVE the early-`continue` paths below
428            // (page-break reset, empty span, ambiguous recognition)
429            // so a complete pass always reports
430            // `candidates_processed == candidates_total` — the
431            // documented contract for a non-truncated `LintResult`.
432            // A pass that aborts mid-loop reports `processed <
433            // total` with the count of candidates we got past the
434            // per-candidate check.
435            candidates_processed += 1;
436
437            // Page-break candidates are scanner-emitted boundaries with no
438            // parsable content. Reset the context BEFORE attempting to parse
439            // — otherwise the parser's MalformedMarking error would skip the
440            // continue and leave us accumulating across pages.
441            if candidate.kind == MarkingType::PageBreak {
442                page_context = PageContext::new();
443                page_context_arc = None;
444                classification_floor = None;
445                continue;
446            }
447
448            // Parse context built per-candidate so the floor accumulated
449            // earlier on the page reaches the recognizer. `strict_evidence
450            // = false` permits the dispatcher
451            // (`StrictOrDecoderRecognizer`, the default) to fall back to
452            // the decoder on strict-parse zero-candidate. The
453            // `StrictRecognizer` ignores this flag entirely; consumers
454            // that pin strict-only behavior install it via
455            // [`Engine::with_recognizer`].
456            //
457            // `preceded_by_whitespace` is computed against the source
458            // buffer here — the decoder receives only the candidate
459            // slice and cannot recover the surrounding context on its
460            // own. Used downstream to suppress prose-glue false
461            // positives like `letter(s)` / `loss(s)` /
462            // `function(c)`. Start-of-buffer counts as whitespace by
463            // the `ParseContext` convention.
464            let preceded_by_whitespace = match candidate.span.start.checked_sub(1) {
465                None => true,
466                Some(prev_idx) => source
467                    .get(prev_idx)
468                    .map(|b| b.is_ascii_whitespace())
469                    .unwrap_or(true),
470            };
471            let parse_cx = ParseContext {
472                strict_evidence: false,
473                zone: None,
474                position: None,
475                classification_floor,
476                as_of: None,
477                preceded_by_whitespace,
478            };
479
480            // Route each candidate's bytes through the recognizer. Zero-
481            // candidate `Ambiguous` means "no plausible interpretation" —
482            // skip, same as a strict-path parser error would in the old
483            // flow (foundational-plan line 609-612). `Unambiguous` returns
484            // a `CapcoMarking` whose `token_spans` are zero-origin relative
485            // to the candidate bytes; shift them back to source-relative
486            // offsets before rules see them.
487            let start = candidate.span.start.min(source.len());
488            let end = candidate.span.end.min(source.len());
489            if start >= end {
490                continue;
491            }
492            let bytes = &source[start..end];
493            let Parsed::Unambiguous(mut marking) = self.recognizer.recognize(bytes, &parse_cx)
494            else {
495                continue;
496            };
497            shift_token_spans(&mut marking.0, start);
498            // Capture the decoder-provenance side channel before
499            // collapsing the marking onto its `IsmAttributes` payload.
500            // Strict-path recognizers leave this `None`; the decoder
501            // populates it with the canonical bytes / posterior /
502            // features the engine needs to mint a
503            // `FixSource::DecoderPosterior` diagnostic below.
504            let provenance = marking.1.take();
505            let attrs = marking.0;
506
507            // FR-011 strict-floor accumulator: only strict-path
508            // recognitions raise the floor. A decoder-path
509            // recognition (provenance.is_some()) does not — we cannot
510            // let a probabilistic recovery self-justify by raising
511            // the threshold it then clears.
512            if provenance.is_none() {
513                if let Some(level) = attrs
514                    .classification
515                    .as_ref()
516                    .map(|c| c.effective_level() as u8)
517                {
518                    classification_floor = Some(match classification_floor {
519                        Some(prev) => prev.max(level),
520                        None => level,
521                    });
522                }
523            }
524
525            // Decoder-path emission (T068): when the recognizer carries
526            // provenance, the recognition went through the decoder
527            // fallback. Synthesize an R001 `decoder-recognition`
528            // diagnostic whose fix rewrites the original mangled bytes
529            // to the decoder's canonical form, with `FixSource::DecoderPosterior`
530            // and a populated `Confidence` (`recognition < 1.0`,
531            // `runner_up_ratio = Some(r)`, non-empty `features`). The
532            // fix participates in the regular confidence-threshold
533            // gate inside `Engine::fix_inner`.
534            if let Some(prov) = provenance {
535                let span = Span::new(start, end);
536                if let Some(diagnostic) = build_decoder_diagnostic(
537                    span,
538                    bytes,
539                    &prov,
540                    candidate.kind,
541                    self.corpus_override_active(),
542                ) {
543                    diagnostics.push(diagnostic);
544                }
545            }
546
547            // Accumulate portions before running banner/CAB rules so that
548            // when we reach a banner candidate the context already reflects
549            // all preceding portion data.
550            if candidate.kind == MarkingType::Portion {
551                page_context.add_portion(attrs.clone());
552                // Invalidate the cached Arc so the next banner/CAB gets a
553                // fresh snapshot. We rebuild it lazily below.
554                page_context_arc = None;
555            }
556
557            // Phase 3: zone and position are Option-typed and stay None
558            // until a structural scanner pass can prove them. The previous
559            // hardcoded `Zone::Body`/`DocumentPosition::Body` was a silent
560            // lie to any future rule that read them.
561            let ctx_page = if candidate.kind != MarkingType::Portion && !page_context.is_empty() {
562                // Lazily wrap the accumulated context in an Arc once per
563                // page-context snapshot; subsequent banner/CAB candidates on
564                // the same page clone only the cheap Arc pointer.
565                Some(
566                    page_context_arc
567                        .get_or_insert_with(|| Arc::new(page_context.clone()))
568                        .clone(),
569                )
570            } else {
571                None
572            };
573            let ctx = RuleContext {
574                marking_type: candidate.kind,
575                zone: None,
576                position: None,
577                page_context: ctx_page,
578                corrections: corrections_arc.clone(),
579            };
580            for rule_set in &self.rule_sets {
581                for rule in rule_set.rules() {
582                    // Skip rules that are configured as Off.
583                    let configured_severity = self
584                        .config
585                        .rules
586                        .overrides
587                        .get(rule.id().as_str())
588                        .and_then(|s| Severity::parse_config(s))
589                        .unwrap_or(rule.default_severity());
590
591                    if configured_severity == Severity::Off {
592                        continue;
593                    }
594
595                    // Whitepaper §6.3 / gap register #10: a buggy rule
596                    // that constructs an out-of-range `Confidence`
597                    // panics inside `FixProposal::new`. Without this
598                    // wrapper, that panic propagates out of `lint()`
599                    // and aborts the entire document — turning one
600                    // rule's defect into a service outage. Catch the
601                    // unwind, log a warning naming the rule, and
602                    // skip it. Other rules and other candidates keep
603                    // running.
604                    //
605                    // `AssertUnwindSafe` is a deliberate best-effort
606                    // containment — `Send + Sync` (which `Rule`
607                    // requires) is NOT the same property as
608                    // `UnwindSafe`. The justification rests on the
609                    // engine's stateless-rule contract
610                    // (`crates/rules/src/lib.rs` `Rule` doc comments):
611                    // `check()` must not mutate state visible across
612                    // invocations. A rule that violates that contract
613                    // via interior mutability could in principle
614                    // observe a torn invariant after a panic — but the
615                    // alternative is to abort the whole `lint()` on
616                    // any rule defect, which is the bug this wrapper
617                    // exists to fix. Containing the failure to the
618                    // offending rule is strictly better than letting
619                    // it cascade. Diagnostics we'd otherwise have
620                    // appended on success are built fresh inside the
621                    // closure, so they don't pollute the outer
622                    // accumulator on the panic path.
623                    //
624                    // Requires `panic = "unwind"` in the release
625                    // profile (`Cargo.toml`). With `panic = "abort"`
626                    // the panic terminates the process before this
627                    // catch can fire.
628                    let rule_id = rule.id();
629                    let catch_result =
630                        std::panic::catch_unwind(AssertUnwindSafe(|| rule.check(&attrs, &ctx)));
631                    let mut diags = match catch_result {
632                        Ok(d) => d,
633                        Err(payload) => {
634                            let msg = panic_payload_to_string(&payload);
635                            tracing::warn!(
636                                target: "marque_engine::rule_panic",
637                                rule = rule_id.as_str(),
638                                error = %msg,
639                                "rule check panicked; skipping this rule for the current candidate"
640                            );
641                            Vec::new()
642                        }
643                    };
644                    // Apply configured severity override.
645                    for d in &mut diags {
646                        d.severity = configured_severity;
647                    }
648                    diagnostics.extend(diags);
649                }
650            }
651        }
652
653        // Pre-scanner text corrections: scan the raw source for
654        // corrections-map keys that the scanner missed (e.g., "SERCET" is
655        // not a known classification prefix, so the scanner never detects
656        // "SERCET//NF" as a candidate, and C001 never sees the token).
657        //
658        // This pass emits C001 diagnostics for raw-text matches that don't
659        // overlap with any C001 diagnostic already produced by the rule
660        // pipeline above. Spans reference the original source buffer.
661        if let Some(cached) = &self.corrections_ac {
662            let c001_severity = self
663                .config
664                .rules
665                .overrides
666                .get("C001")
667                .and_then(|s| Severity::parse_config(s))
668                .unwrap_or(Severity::Fix);
669
670            if c001_severity != Severity::Off {
671                // Collect spans already covered by rule-pipeline C001.
672                let existing_c001_spans: std::collections::HashSet<Span> = diagnostics
673                    .iter()
674                    .filter(|d| d.rule.as_str() == "C001")
675                    .map(|d| d.span)
676                    .collect();
677
678                // Use the pre-built AhoCorasick automaton to scan the full
679                // source in a single O(n + m) pass. The automaton and its
680                // active pairs were built once at Engine construction time.
681                for mat in cached.ac.find_iter(source) {
682                    let span = Span::new(mat.start(), mat.end());
683                    let (ref key, ref value) = cached.active[mat.pattern().as_usize()];
684
685                    // Skip if the rule pipeline already produced a C001
686                    // diagnostic for this exact span.
687                    if !existing_c001_spans.contains(&span) {
688                        let proposal = FixProposal::new(
689                            RuleId::new("C001"),
690                            FixSource::CorrectionsMap,
691                            span,
692                            key.as_ref(),
693                            value.as_ref(),
694                            marque_rules::Confidence::strict(1.0),
695                            None,
696                        );
697                        diagnostics.push(Diagnostic::new(
698                            RuleId::new("C001"),
699                            c001_severity,
700                            span,
701                            format!("corrections map: {key:?} → {value:?}"),
702                            CORRECTIONS_MAP_CITATION,
703                            Some(proposal),
704                        ));
705                    }
706                }
707            }
708        }
709
710        // Suggest-don't-fix channel post-pass (issue #235 / #186 PR-3).
711        //
712        // Only `Severity::Fix` diagnostics are rewritten — those are
713        // the ones whose authoring rule expects auto-application. A
714        // sub-threshold `FixProposal` attached to a `Fix`-severity
715        // diagnostic stays observable in lint output by being
716        // demoted to `Severity::Suggest` instead of being silently
717        // dropped at the fix-collection threshold gate.
718        //
719        // Error/Warn/Info rules with sub-threshold fixes keep their
720        // severity (the violation IS what the rule says it is; only
721        // the suggested replacement is uncertain) and the fix is
722        // silently dropped at the apply gate as before. Suggest-channel
723        // reuse for Error/Warn fixes is out of scope for PR-C — making
724        // a normative ordering rule like E003 CI-silent because its
725        // fix confidence sits below threshold would be a behavioral
726        // regression.
727        //
728        // This unifies two emission paths into a single visible
729        // channel for `Fix`-severity rules:
730        //
731        //   - Rules that explicitly emit at `Severity::Suggest`
732        //     (e.g., `S004 rel-to-trigraph-suggest`).
733        //   - `Fix`-severity rules whose proposal confidence falls
734        //     below the configured threshold (decoder-sourced fixes
735        //     that didn't quite clear the bar are the canonical case).
736        //
737        // The fix stays attached because the renderer surfaces the
738        // candidate replacement; only the severity is changed. The
739        // constitutional V audit-content-ignorance invariant is
740        // preserved — no fields are modified except `severity`,
741        // which is metadata not document content.
742        //
743        // `Engine::fix_inner` re-applies the threshold gate on its own
744        // (and now also filters by `severity != Suggest`), so a
745        // diagnostic rewritten here will not be promoted to an
746        // `AppliedFix` even if a later threshold-override raises the
747        // floor.
748        let threshold = self.config.confidence_threshold();
749        for d in &mut diagnostics {
750            if d.severity != Severity::Fix {
751                continue;
752            }
753            let Some(fix) = d.fix.as_ref() else { continue };
754            if fix.confidence.combined() < threshold {
755                d.severity = Severity::Suggest;
756            }
757        }
758
759        LintResult {
760            diagnostics,
761            truncated: false,
762            candidates_processed,
763            candidates_total,
764            ..Default::default()
765        }
766    }
767
768    /// Lint and apply fixes. Returns fixed source and audit log.
769    ///
770    /// Fix application order follows FR-016: `(span.end DESC, span.start DESC,
771    /// rule_id ASC, replacement ASC)` so reverse-byte application preserves
772    /// earlier-span offsets and equal-span ties break deterministically.
773    ///
774    /// Uses the confidence threshold configured in the engine's `Config`.
775    /// To supply a per-call override (e.g., from a `--confidence` CLI flag
776    /// or an HTTP request field), use [`Engine::fix_with_threshold`] or
777    /// [`Engine::fix_with_options`].
778    ///
779    /// Back-compat shim over [`Engine::fix_with_options`] — `fix(src, mode)`
780    /// is equivalent to `fix_with_options(src, mode, &FixOptions::default())`
781    /// (no deadline, no threshold override). Both invariants make the
782    /// `expect` here unreachable: the default options carry no deadline so
783    /// `EngineError::DeadlineExceeded` cannot fire, and the config
784    /// threshold is pre-validated at load time so
785    /// `EngineError::InvalidThreshold` cannot fire.
786    pub fn fix(&self, source: &[u8], mode: FixMode) -> FixResult {
787        self.fix_with_options(source, mode, &FixOptions::default())
788            .expect(
789                "fix() default options cannot fail: no deadline + pre-validated config threshold",
790            )
791    }
792
793    /// Lint and apply fixes using an optional per-call confidence threshold.
794    ///
795    /// When `threshold_override` is `Some`, it replaces the config-level
796    /// threshold for this call only and is validated against `[0.0, 1.0]`.
797    /// When `None`, the engine falls back to `Config::confidence_threshold`.
798    ///
799    /// This signature is preserved for back-compat. New callers should
800    /// prefer [`Engine::fix_with_options`], which carries the deadline
801    /// surface alongside the threshold override.
802    pub fn fix_with_threshold(
803        &self,
804        source: &[u8],
805        mode: FixMode,
806        threshold_override: Option<f32>,
807    ) -> Result<FixResult, InvalidThreshold> {
808        let opts = FixOptions {
809            threshold_override,
810            ..Default::default()
811        };
812        match self.fix_with_options(source, mode, &opts) {
813            Ok(result) => Ok(result),
814            Err(EngineError::InvalidThreshold(it)) => Err(it),
815            // No caller can reach this arm: `fix_with_threshold`'s
816            // public signature does not accept a deadline, so the
817            // `FixOptions` we built above has `deadline: None`. A
818            // future signature change that introduces one would have
819            // to remove this `unreachable!` deliberately.
820            Err(EngineError::DeadlineExceeded { .. }) => {
821                unreachable!("fix_with_threshold cannot set a deadline through its signature")
822            }
823        }
824    }
825
826    /// Lint and apply fixes with per-call options (spec 005 §R2).
827    ///
828    /// Phase 2 honors `opts.deadline` via cooperative cancellation
829    /// (spec §R3). Asymmetric response per §R4 / Constitution V
830    /// Principle V (audit-record integrity): a deadline expiring at
831    /// any point during the fix path returns
832    /// `Err(EngineError::DeadlineExceeded { partial_lint })` rather
833    /// than a partial `FixResult`. The `partial_lint` carries
834    /// whatever the lint phase had produced before the deadline
835    /// fired (or a fully-truncated lint when the deadline was
836    /// already expired on entry); no half-applied fix is ever
837    /// emitted into the audit stream.
838    ///
839    /// `opts.threshold_override` is honored from Phase 1 onward; an
840    /// out-of-range / NaN value is rejected as
841    /// `EngineError::InvalidThreshold` before any work runs.
842    pub fn fix_with_options(
843        &self,
844        source: &[u8],
845        mode: FixMode,
846        opts: &FixOptions,
847    ) -> Result<FixResult, EngineError> {
848        let threshold = match opts.threshold_override {
849            Some(value) => {
850                if !(0.0..=1.0).contains(&value) || value.is_nan() {
851                    return Err(EngineError::InvalidThreshold(InvalidThreshold(value)));
852                }
853                value
854            }
855            None => self.config.confidence_threshold(),
856        };
857
858        self.fix_inner(source, mode, threshold, opts.deadline)
859    }
860
861    fn fix_inner(
862        &self,
863        source: &[u8],
864        mode: FixMode,
865        threshold: f32,
866        deadline: Option<Instant>,
867    ) -> Result<FixResult, EngineError> {
868        use std::collections::HashSet;
869
870        // Two-pass fix strategy for pre-scanner text corrections.
871        //
872        // Pass 1: lint the original source. The pre-scanner text scan may
873        // produce C001 diagnostics for corrections-map matches the scanner
874        // missed (e.g., "SERCET" is not a known classification prefix).
875        // Apply those C001 fixes to produce an intermediate source.
876        //
877        // Pass 2: re-lint the intermediate source. The scanner now detects
878        // the corrected marking (e.g., "SECRET//NF") and additional rules
879        // fire (e.g., E001 on NF→NOFORN). Apply those fixes on top.
880        //
881        // Without this, the spec scenario "SERCET//NF → SECRET//NOFORN"
882        // would stop at "SECRET//NF".
883        //
884        // T010: deadline propagates to every internal lint pass. An
885        // expired deadline at lint time produces a truncated lint, and
886        // the post-lint check below converts that into the asymmetric
887        // `Err(DeadlineExceeded { partial_lint })` shape per spec §R4
888        // (Constitution V Principle V — no partial `FixResult` leaks
889        // into the audit stream).
890        let lint_opts = LintOptions {
891            deadline,
892            ..Default::default()
893        };
894        let lint1 = self.lint_with_options(source, &lint_opts);
895        if deadline_expired(deadline) {
896            return Err(EngineError::DeadlineExceeded {
897                partial_lint: lint1,
898            });
899        }
900        let (effective_source, pass1_applied) =
901            self.apply_text_corrections(source, &lint1, threshold, mode);
902
903        let lint = if !pass1_applied.is_empty() {
904            // Re-lint the corrected source so the scanner picks up newly-valid markings.
905            self.lint_with_options(&effective_source, &lint_opts)
906        } else {
907            lint1
908        };
909
910        // Post-lint deadline check: if the deadline expired during
911        // either pass-1 or pass-2 lint (or during text-correction
912        // application between them), bail out before building any
913        // fix entries. `partial_lint` carries whatever the lint phase
914        // produced — including `truncated: true` when applicable.
915        if deadline_expired(deadline) {
916            return Err(EngineError::DeadlineExceeded { partial_lint: lint });
917        }
918
919        // Suggest-don't-fix channel: `Severity::Suggest` is a hard
920        // exclusion from auto-apply by construction. The lint
921        // post-pass already rewrites below-threshold proposals to
922        // `Suggest`, but explicit `Suggest` rules (e.g., S004) can
923        // also emit fixes that clear the threshold yet must NOT be
924        // applied. This filter handles both cases uniformly.
925        let mut fixes: Vec<_> = lint
926            .diagnostics
927            .iter()
928            .filter(|d| d.severity != Severity::Suggest)
929            .filter_map(|d| d.fix.as_ref())
930            .filter(|f| f.confidence.combined() >= threshold)
931            .filter(|f| !f.span.is_empty())
932            .collect();
933
934        // FR-016: deterministic total-order fix application.
935        // Sort by (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
936        fixes.sort_by(|a, b| {
937            b.span
938                .end
939                .cmp(&a.span.end)
940                .then(b.span.start.cmp(&a.span.start))
941                .then(a.rule.cmp(&b.rule))
942                .then(a.replacement.cmp(&b.replacement))
943        });
944
945        // C-1: overlap guard. After the FR-016 sort, two fixes can still
946        // touch the same byte range if multiple rules emit a fix for the
947        // same span (or overlapping spans). Applying both via `splice`
948        // would silently corrupt the byte stream. We keep the first fix
949        // per span (which under FR-016 ordering is deterministic) and
950        // surface the dropped fixes through `remaining_diagnostics`.
951        //
952        // The walk is over fixes in reverse-end order, so a fix is kept
953        // only if its `span.end` is at or below the previous kept fix's
954        // `span.start` — i.e., strictly to the left, no overlap.
955        // Clone the kept fixes into owned `FixProposal`s so the
956        // borrow on `lint.diagnostics` ends with `fixes`. That
957        // matters for T011: the per-fix deadline-bail path needs to
958        // move `lint` into `EngineError::DeadlineExceeded`, which is
959        // only legal once nothing inside the body still references
960        // it. The clone count is bounded by the number of kept
961        // fixes (after the C-1 dedup), which is small in practice.
962        let mut kept_fixes: Vec<FixProposal> = Vec::with_capacity(fixes.len());
963        let mut next_window_end: Option<usize> = None;
964        for fix in &fixes {
965            let fits = match next_window_end {
966                Some(boundary) => fix.span.end <= boundary,
967                None => true,
968            };
969            if fits {
970                next_window_end = Some(fix.span.start);
971                kept_fixes.push((*fix).clone());
972            }
973        }
974        drop(fixes); // release the iter borrow on `lint.diagnostics`
975
976        // M-4: hold the classifier id in an `Arc<str>` so cloning into each
977        // applied-fix audit record is an O(1) refcount bump rather than a
978        // full string copy per fix.
979        let classifier_id: Option<std::sync::Arc<str>> = self
980            .config
981            .user
982            .classifier_id
983            .as_deref()
984            .map(std::sync::Arc::from);
985        let dry_run = mode == FixMode::DryRun;
986        let now = self.clock.now();
987
988        // H-7: applied-fix lookup is keyed by (RuleId, Span). Use a HashSet
989        // so the per-diagnostic filter at the bottom of this function is
990        // O(1) per query instead of O(n) over a Vec.
991        let mut applied_keys: HashSet<(RuleId, Span)> = HashSet::with_capacity(kept_fixes.len());
992        let mut applied: Vec<AppliedFix> = Vec::with_capacity(kept_fixes.len());
993
994        // T011: per-fix-application deadline check. The check sits
995        // at the top of each iteration so the abort happens between
996        // fixes — the audit-record integrity invariant
997        // (Constitution V Principle V) is preserved because we
998        // never construct a half-applied `FixResult`. If a fix has
999        // already been applied to `buf` and `applied`, we drop both
1000        // and surface the asymmetric `Err(DeadlineExceeded)` shape;
1001        // the partial buffer is intentionally discarded so no
1002        // partially-fixed bytes can leak to a caller.
1003        //
1004        // Pre-apply check: catch a deadline that expired during
1005        // fix collection / sort / dedup BEFORE we clone
1006        // `effective_source` into `buf` (which is O(source bytes)
1007        // and pointless if we're about to drop the buffer on the
1008        // floor). On large inputs the clone alone can be the
1009        // dominant cost; the post-lint check above doesn't cover
1010        // it because the sort + dedup phase between the two adds
1011        // its own latency on documents with many fixes.
1012        if deadline_expired(deadline) {
1013            return Err(EngineError::DeadlineExceeded { partial_lint: lint });
1014        }
1015
1016        // Only allocate the output buffer when we actually need to
1017        // mutate it. Dry-run returns the original source verbatim.
1018        let mut deadline_aborted = false;
1019        let output = match mode {
1020            FixMode::Apply => {
1021                // Forward-pass buffer construction: O(source_len + Σ replacement_lens).
1022                //
1023                // `kept_fixes` is in (span.end DESC, span.start DESC) order
1024                // from the FR-016 sort (line ~936) and C-1 dedup walk.
1025                // Iterating in reverse gives ascending span.end / span.start
1026                // order so we can copy each gap and replacement in a single
1027                // left-to-right pass over `effective_source`.
1028                //
1029                // This replaces the previous `Vec::splice`-per-fix approach
1030                // that was O(N × M): each splice shifted every byte after the
1031                // splice point, so N evenly-spaced fixes on an M-byte buffer
1032                // cost O(N × M / 2) total — quadratic when fix density scales
1033                // with document size.
1034                //
1035                // After C-1 has guaranteed `kept_fixes` is non-overlapping in
1036                // reverse-end order, ascending order is also non-overlapping
1037                // (the property does not depend on traversal direction), so the
1038                // forward walk is safe.
1039                let extra: usize = kept_fixes
1040                    .iter()
1041                    .map(|f| {
1042                        // `saturating_sub` gives the per-fix growth contribution
1043                        // (0 when the replacement is shorter than the span).
1044                        // The result is an upper-bound preallocation: fixes that
1045                        // shrink the buffer contribute 0 here, so the true net
1046                        // change may be smaller. This is intentional — it avoids
1047                        // the sign-handling complexity of a true net delta while
1048                        // still preventing the O(log N) reallocation cascade that
1049                        // would occur for repeated grow-by-one insertions.
1050                        f.replacement
1051                            .len()
1052                            .saturating_sub(f.span.end - f.span.start)
1053                    })
1054                    .sum();
1055                let mut buf = Vec::with_capacity(effective_source.len() + extra);
1056                let mut last_end = 0usize;
1057                for fix in kept_fixes.iter().rev() {
1058                    if deadline_expired(deadline) {
1059                        deadline_aborted = true;
1060                        break;
1061                    }
1062                    buf.extend_from_slice(&effective_source[last_end..fix.span.start]);
1063                    buf.extend_from_slice(fix.replacement.as_bytes());
1064                    last_end = fix.span.end;
1065                }
1066                if !deadline_aborted {
1067                    // Append the tail after the last fix (or the full source if
1068                    // there were no fixes).
1069                    buf.extend_from_slice(&effective_source[last_end..]);
1070                }
1071                // Audit records: original descending order, matching DryRun so
1072                // the two modes produce identical `applied` orderings.
1073                if !deadline_aborted {
1074                    for fix in kept_fixes {
1075                        if deadline_expired(deadline) {
1076                            deadline_aborted = true;
1077                            break;
1078                        }
1079                        applied_keys.insert((fix.rule.clone(), fix.span));
1080                        applied.push(AppliedFix::__engine_promote(
1081                            fix,
1082                            now,
1083                            classifier_id.clone(),
1084                            dry_run,
1085                            None, // input identifier set by CLI at the boundary
1086                            engine_promotion_token(),
1087                        ));
1088                    }
1089                }
1090                buf
1091            }
1092            FixMode::DryRun => {
1093                for fix in kept_fixes {
1094                    if deadline_expired(deadline) {
1095                        deadline_aborted = true;
1096                        break;
1097                    }
1098                    applied_keys.insert((fix.rule.clone(), fix.span));
1099                    applied.push(AppliedFix::__engine_promote(
1100                        fix,
1101                        now,
1102                        classifier_id.clone(),
1103                        dry_run,
1104                        None,
1105                        engine_promotion_token(),
1106                    ));
1107                }
1108                source.to_vec()
1109            }
1110        };
1111
1112        if deadline_aborted {
1113            // `partial_lint` carries the full diagnostics produced by
1114            // the lint phase that completed before the apply loop ran.
1115            // The apply loop ran partially; per Constitution V
1116            // Principle V, that partial state is dropped on the floor
1117            // and the caller sees only the lint result. Pass-1 text
1118            // corrections that were applied are also discarded — the
1119            // audit stream gets nothing from this call.
1120            return Err(EngineError::DeadlineExceeded { partial_lint: lint });
1121        }
1122
1123        // Prepend pass-1 text corrections to the applied list so they
1124        // appear in the audit trail.
1125        let mut all_applied = pass1_applied;
1126        all_applied.extend(applied);
1127
1128        // Remaining diagnostics: those whose fix was not applied.
1129        // Filter by (rule_id, span) pair — not just rule ID — so that if
1130        // rule E001 fires on three spans and only one is fixed, the other
1131        // two remain.
1132        let remaining_diagnostics = lint
1133            .diagnostics
1134            .into_iter()
1135            .filter(|d| {
1136                !d.fix
1137                    .as_ref()
1138                    .is_some_and(|f| applied_keys.contains(&(f.rule.clone(), f.span)))
1139            })
1140            .collect();
1141
1142        Ok(FixResult {
1143            source: output,
1144            applied: all_applied,
1145            remaining_diagnostics,
1146        })
1147    }
1148
1149    /// Apply pre-scanner text corrections (C001) from lint diagnostics and
1150    /// return the corrected source + applied fixes. Used by `fix_inner` to
1151    /// produce an intermediate source that the scanner can detect.
1152    fn apply_text_corrections(
1153        &self,
1154        source: &[u8],
1155        lint: &LintResult,
1156        threshold: f32,
1157        mode: FixMode,
1158    ) -> (Vec<u8>, Vec<AppliedFix>) {
1159        // Mirror `fix_inner`'s suggest-channel exclusion: a C001
1160        // diagnostic that the lint post-pass rewrote to
1161        // `Severity::Suggest` (because its confidence fell below
1162        // threshold) must not be auto-applied here either.
1163        let mut text_fixes: Vec<&FixProposal> = lint
1164            .diagnostics
1165            .iter()
1166            .filter(|d| d.rule.as_str() == "C001")
1167            .filter(|d| d.severity != Severity::Suggest)
1168            .filter_map(|d| d.fix.as_ref())
1169            .filter(|f| f.source == FixSource::CorrectionsMap)
1170            .filter(|f| f.confidence.combined() >= threshold)
1171            .filter(|f| !f.span.is_empty())
1172            .collect();
1173
1174        if text_fixes.is_empty() {
1175            return (source.to_vec(), Vec::new());
1176        }
1177
1178        // Sort and deduplicate using FR-016 order + C-1 overlap guard.
1179        text_fixes.sort_by(|a, b| {
1180            b.span
1181                .end
1182                .cmp(&a.span.end)
1183                .then(b.span.start.cmp(&a.span.start))
1184                .then(a.rule.cmp(&b.rule))
1185                .then(a.replacement.cmp(&b.replacement))
1186        });
1187        let mut kept: Vec<&FixProposal> = Vec::new();
1188        let mut next_end: Option<usize> = None;
1189        for fix in &text_fixes {
1190            let fits = next_end.is_none_or(|b| fix.span.end <= b);
1191            if fits {
1192                next_end = Some(fix.span.start);
1193                kept.push(*fix);
1194            }
1195        }
1196
1197        let classifier_id: Option<Arc<str>> =
1198            self.config.user.classifier_id.as_deref().map(Arc::from);
1199        let dry_run = mode == FixMode::DryRun;
1200        let now = self.clock.now();
1201
1202        // Always apply text corrections to the intermediate buffer, even in
1203        // DryRun mode. This buffer is internal — pass 2 needs it to re-lint
1204        // corrected text so downstream rules fire (e.g., E001 on NF after
1205        // SERCET→SECRET). The final output for DryRun returns the original
1206        // source in fix_inner, not this intermediate buffer.
1207        let mut buf = source.to_vec();
1208        let mut applied = Vec::with_capacity(kept.len());
1209        for fix in &kept {
1210            buf.splice(fix.span.start..fix.span.end, fix.replacement.bytes());
1211            applied.push(AppliedFix::__engine_promote(
1212                (*fix).clone(),
1213                now,
1214                classifier_id.clone(),
1215                dry_run,
1216                None,
1217                engine_promotion_token(),
1218            ));
1219        }
1220
1221        (buf, applied)
1222    }
1223}
1224
1225// ---------------------------------------------------------------------------
1226// Engine-only AppliedFix promotion gate (Constitution V Principle V)
1227// ---------------------------------------------------------------------------
1228
1229/// Mint an [`EnginePromotionToken`] for [`AppliedFix::__engine_promote`].
1230///
1231/// This is the **single** place inside `marque-engine` where the engine
1232/// grants itself the privilege to promote a `FixProposal` to an
1233/// `AppliedFix`. Constitution V Principle V scopes audit-record
1234/// promotion to `Engine::fix_inner` and `Engine::apply_text_corrections`
1235/// (the three production call sites in this file). Centralizing the
1236/// token construction here makes "where does the engine decide to
1237/// promote?" a one-grep question, and means a future refactor that
1238/// adds a fourth promotion site has to thread through this function
1239/// — a deliberate decision, not an accident.
1240///
1241/// `EnginePromotionToken`'s sole field is private to `marque-rules`,
1242/// so external crates cannot brace-construct one. The
1243/// `__engine_construct` constructor on the token is `#[doc(hidden)]`
1244/// and named to make its intent unmistakable to anyone reading a call
1245/// site outside the engine.
1246#[inline]
1247fn engine_promotion_token() -> EnginePromotionToken {
1248    EnginePromotionToken::__engine_construct()
1249}
1250
1251// ---------------------------------------------------------------------------
1252// Decoder-path diagnostic synthesis (Phase 4 PR-4b — T068)
1253// ---------------------------------------------------------------------------
1254
1255/// Build the synthetic `R001 decoder-recognition` diagnostic the engine
1256/// emits when a recognizer returned a marking carrying
1257/// [`DecoderProvenance`]. Returns `None` when the original or canonical
1258/// bytes are not valid UTF-8 — `FixProposal` carries `Box<str>` for both
1259/// `original` and `replacement`, so we cannot construct the proposal
1260/// without UTF-8 validity. CAPCO markings are ASCII by spec (CAPCO-2016
1261/// §A.6); a non-UTF-8 result here would mean the canonicalization pass
1262/// produced something the strict parser shouldn't have accepted, which
1263/// is a separate bug to surface — silently dropping the synthetic
1264/// diagnostic is the conservative move.
1265///
1266/// # Audit-shape contract (Constitution V Principle V / G13)
1267///
1268/// The diagnostic's `message` and the synthesized `FixProposal.original`
1269/// MUST NOT carry verbatim input bytes — only token canonicals, span
1270/// offsets, and digests/posterior scalars are permitted in audit
1271/// output. The "before" form is therefore omitted from the message
1272/// and `proposal.original` is set to the empty string for
1273/// decoder-path R001 records: span tells the audit consumer *where*
1274/// the fix landed, `proposal.replacement` carries *what* it became.
1275/// The original bytes already exist in the source document; the audit
1276/// record is not the right channel for them.
1277///
1278/// Note: this contract addresses the audit-record *shape*. A separate
1279/// upstream concern is whether `proposal.replacement` itself is a
1280/// well-formed canonical (Constitution V permits "token canonicals"
1281/// in audit output). When the decoder accepts unrecognized bytes as a
1282/// compartment-shaped token and uppercases them, the resulting
1283/// "canonical" carries those bytes through `replacement` — that's a
1284/// decoder-correctness issue to address separately.
1285///
1286/// The fix's `Confidence` is populated entirely from the decoder's
1287/// provenance trace:
1288///
1289/// - `recognition` derives from `runner_up_ratio` via softmax (see
1290///   [`DecoderProvenance::recognition_score`]); strictly less than
1291///   `1.0` so audit consumers can distinguish strict from decoder
1292///   provenance via a single field comparison.
1293/// - `rule` is `1.0` — once the decoder has decided unambiguously the
1294///   recognition-layer rewrite is itself unambiguous (rewrite the
1295///   observed bytes to canonical bytes), so the rule axis carries no
1296///   additional uncertainty. The decoder's recognition uncertainty is
1297///   already captured in `recognition`.
1298/// - `runner_up_ratio` and `features` thread through verbatim from the
1299///   provenance.
1300/// - When `corpus_override_active` is `true`, an extra
1301///   [`FeatureId::CorpusOverrideInEffect`] contribution with
1302///   `delta = 0.0` is appended to `features`. The zero delta is
1303///   load-bearing: PR-5 minimal scope wires the surface end-to-end
1304///   without yet substituting override priors into decoder scoring,
1305///   so the contribution is purely an audit-trail marker
1306///   ("this fix was produced under organizational overrides")
1307///   rather than an actual posterior shift. A future PR that wires
1308///   override-prior substitution will replace `0.0` with the real
1309///   delta and re-version the audit schema.
1310fn build_decoder_diagnostic(
1311    span: Span,
1312    original_bytes: &[u8],
1313    provenance: &DecoderProvenance,
1314    _kind: marque_ism::MarkingType,
1315    corpus_override_active: bool,
1316) -> Option<Diagnostic> {
1317    use marque_rules::confidence::{FeatureContribution, FeatureId};
1318
1319    let original = std::str::from_utf8(original_bytes).ok()?;
1320    let replacement = std::str::from_utf8(&provenance.canonical_bytes).ok()?;
1321
1322    // No-op rewrite (canonicalization preserved bytes byte-for-byte) is
1323    // not informative and would produce a degenerate audit record; skip.
1324    if original == replacement {
1325        return None;
1326    }
1327
1328    let mut features: Vec<FeatureContribution> = provenance.features.to_vec();
1329    if corpus_override_active {
1330        features.push(FeatureContribution {
1331            id: FeatureId::CorpusOverrideInEffect,
1332            delta: 0.0,
1333        });
1334    }
1335
1336    // Dispatch on the decoder's `fix_source`. Standard vocab-based
1337    // recognition emits at `Severity::Fix` with `rule = 1.0` (engine
1338    // applies whenever `recognition >= confidence_threshold`). The
1339    // position-aware classification heuristic (issue #133 PR 2) emits
1340    // at `Severity::Warn` (always-visible in `--check`, non-zero exit
1341    // code) with `rule = HEURISTIC_RULE_AXIS_CAP = 0.95` matching the
1342    // default `confidence_threshold`. PR 4's empirical corpus
1343    // measurement justifies the `0.95` value — see the cap's doc
1344    // comment for the analysis script and measured numbers.
1345    let (severity, rule_axis, fix_source) = match provenance.fix_source {
1346        FixSource::DecoderClassificationHeuristic => (
1347            Severity::Warn,
1348            HEURISTIC_RULE_AXIS_CAP,
1349            FixSource::DecoderClassificationHeuristic,
1350        ),
1351        // All non-heuristic decoder paths use the existing posterior
1352        // shape. Strict-source variants (BuiltinRule, CorrectionsMap,
1353        // MigrationTable) do not flow through this builder — they
1354        // come from rule-pipeline emissions, not the decoder — so
1355        // routing them to `DecoderPosterior` here is a defensive
1356        // default that preserves the existing strict-decoder shape
1357        // for any future fix-source variant.
1358        _ => (Severity::Fix, 1.0, FixSource::DecoderPosterior),
1359    };
1360
1361    let confidence = Confidence {
1362        recognition: provenance.recognition_score(),
1363        rule: rule_axis,
1364        region: None,
1365        runner_up_ratio: provenance.runner_up_ratio,
1366        features,
1367    };
1368    let rule = RuleId::new(DECODER_RULE_ID);
1369    // Audit-shape contract: `proposal.original` is the empty string for
1370    // decoder-path R001 records (Constitution V Principle V / G13). The
1371    // span identifies *where* the fix landed; the bytes are still in
1372    // the source document. The unused `original` binding documents that
1373    // we held UTF-8 validity for the input but intentionally do not
1374    // route it into the audit record.
1375    let _ = original;
1376    let proposal = FixProposal::new(
1377        rule.clone(),
1378        fix_source,
1379        span,
1380        "",
1381        replacement,
1382        confidence,
1383        None,
1384    );
1385    Some(Diagnostic::new(
1386        rule,
1387        severity,
1388        span,
1389        format!("decoder-recognized canonical form: {replacement:?}"),
1390        DECODER_CITATION,
1391        Some(proposal),
1392    ))
1393}
1394
1395/// `Confidence::rule` cap for the position-aware classification
1396/// heuristic (`FixSource::DecoderClassificationHeuristic`). Pinned
1397/// at `0.95` matching the default `confidence_threshold` — solo-
1398/// candidate heuristic fixes auto-apply at the default threshold;
1399/// multi-candidate cases (heuristic plus a competing recovery)
1400/// drop below `0.95` because `recognition` falls with the runner-
1401/// up margin and the user retains agency to verify. The diagnostic
1402/// is always emitted at [`Severity::Warn`](marque_rules::Severity::Warn)
1403/// regardless of confidence, so `--check` exits non-zero whenever
1404/// the heuristic fires.
1405///
1406/// # Empirical justification (issue #133 PR 4)
1407///
1408/// The relevant FP rate isn't "trigger appears in arbitrary prose"
1409/// but "trigger appears as a standalone token in a context that
1410/// also contains marking-shape signals (`//` outside URLs, or any
1411/// CAPCO marking long-form like `NOFORN`/`SECRET`/`REL TO`/etc.)
1412/// within proximity" — because the decoder heuristic only fires
1413/// when the strict parse fails on input that's already
1414/// marking-shaped. PR 2's initial guess of `0.80` was based on the
1415/// reading "we can't be 97% sure"; PR 4 measured the conditional
1416/// FP rate against the full Enron corpus and confirmed the
1417/// in-context heuristic is well-calibrated above `0.95`.
1418///
1419/// Headline numbers from the committed evidence file
1420/// (`tools/corpus-analysis/output/heuristic_frequencies.json`,
1421/// case-insensitive scan over 510,596 Enron documents — case-
1422/// insensitive because the decoder uppercases inputs before running
1423/// the heuristic, so a runtime-faithful measurement must capture
1424/// lowercase trigger appearances too):
1425///
1426/// - **11 of 37 triggers** have zero marking-context hits across
1427///   the corpus (the case-sensitive prior measurement reported
1428///   23/37, but those numbers undercounted the runtime distribution).
1429/// - The worst-case per-occurrence in-context rate is `V` at
1430///   814/23,331 ≈ 3.49% (`V`→`C` heuristic). Interpreted as "of
1431///   every 100 standalone `V` tokens in body text, ~3.5 sit
1432///   within ~30 chars of a marking-shape signal." Corresponds to
1433///   ~96.5% per-occurrence precision — still above the 0.95 cap,
1434///   though with thinner headroom than the prior measurement
1435///   showed.
1436/// - Most other non-zero triggers stay below ~1.5% per-occurrence
1437///   (A: 0.15%, E: 0.34%, RE: 0.19%, W: 0.94%, F: 0.50%, etc.).
1438///
1439/// **Cap calibration**: the 0.95 cap is justified by the measured
1440/// per-occurrence in-context rates above. Two prior framings of
1441/// this paragraph (a "5,000-file sample" with hand-derived numbers
1442/// and a "Bayesian credible upper bound ≥ 99.94%" calculation) were
1443/// dropped because (a) the sample numbers were superseded by the
1444/// full-corpus measurement, and (b) the Bayesian calculation used
1445/// a different denominator (`marking_context / total_docs`) than
1446/// the per-occurrence rate (`marking_context / unrestricted`),
1447/// making them not directly comparable. Use the measured per-
1448/// occurrence rates directly.
1449///
1450/// **Important caveat — loose upper bound**: the per-occurrence rate
1451/// is an UPPER BOUND on the heuristic's true FP rate, not the rate
1452/// itself. The metric counts "trigger token appears within ~30 chars
1453/// of a marking signal," which catches every potential heuristic-
1454/// fire input but ALSO includes many that the
1455/// [`try_classification_heuristic_fix`](crate::decoder)
1456/// guards (lone-input check, leading-position requirement,
1457/// multi-token-after-leading-position requirement) would filter out
1458/// before the heuristic ever fires. The true FP rate is likely well
1459/// below the worst-case 3.49% bound — but if real-world deployment
1460/// shows V-shaped triggers producing too many false positives, the
1461/// per-trigger plumbing originally proposed for PR 4 should land
1462/// (skip-list V, drop its rule confidence, etc.).
1463///
1464/// Spot-check the evidence file for per-trigger detail; this doc
1465/// summarizes qualitatively to avoid drift if the file is
1466/// regenerated against a different corpus.
1467///
1468/// To re-measure (e.g., when a different corpus is added):
1469///
1470/// ```text
1471/// python3 tools/corpus-analysis/analyze.py \
1472///     --mode heuristic-frequency \
1473///     --output tools/corpus-analysis/output/heuristic_frequencies.json
1474/// ```
1475///
/// If a future measurement shows a trigger's per-occurrence
/// marking-context rate approaching ~5% — the point at which
/// per-occurrence precision no longer clears the `0.95` cap; the
/// current worst case is `V` at ≈3.49% — (e.g., a corpus that
/// contains heavy use of one of these tokens in a marking-adjacent
/// way), this cap should drop or the per-trigger plumbing originally
/// proposed for PR 4 should land. Pinned at the engine boundary by
/// `engine::tests::heuristic_rule_axis_cap_matches_default_threshold`.
1482const HEURISTIC_RULE_AXIS_CAP: f32 = 0.95;
1483
1484// ---------------------------------------------------------------------------
1485// Rule-override canonicalization (task #49)
1486// ---------------------------------------------------------------------------
1487
1488/// Resolve every key in `config.rules.overrides` against the registered
1489/// rule sets. Both the rule ID (`"E001"`) and the rule name
1490/// (`"portion-mark-in-banner"`) are accepted — after canonicalization
1491/// the override map keys by canonical ID only, and the per-rule lookup
1492/// in `lint()` / `fix_inner()` keeps working unchanged.
1493///
1494/// Fails closed on:
1495/// - **Unknown keys** — `E999 = "warn"` or `not-a-rule = "error"` → the
1496///   user has almost certainly typo'd a rule reference. Silent acceptance
1497///   (the pre-#49 behavior) means the user thought they were configuring
1498///   the rule, but nothing happened at lint time. Emits
1499///   `EngineConstructionError::UnknownRuleOverride` with a best-effort
1500///   `did_you_mean` suggestion (Levenshtein ≤ 3 against the union of
1501///   known IDs and names).
1502/// - **Conflicting duplicate forms** — `E001 = "warn"` AND
1503///   `portion-mark-in-banner = "error"` in the same merged config →
1504///   the two entries resolved to the same rule but with different
1505///   severities. One form would have silently won the HashMap race.
1506///   Emits `EngineConstructionError::ConflictingRuleOverride`.
1507///
1508/// Duplicate forms with the *same* severity are silently accepted —
1509/// a user writing both `E001 = "warn"` and `portion-mark-in-banner =
1510/// "warn"` (intentionally or via copy-paste across config layers) gets
1511/// the expected behavior.
1512fn canonicalize_rule_overrides(
1513    config: &mut Config,
1514    rule_sets: &[Box<dyn RuleSet>],
1515) -> Result<(), EngineConstructionError> {
1516    if config.rules.overrides.is_empty() {
1517        return Ok(());
1518    }
1519
1520    // Build the ID-and-name → canonical-ID lookup. Both sides live in
1521    // `&'static str` (RuleId's inner slice, rule.name()), so the map's
1522    // keys and values are all `'static`.
1523    let mut known: HashMap<&'static str, &'static str> = HashMap::new();
1524    for rule_set in rule_sets {
1525        for rule in rule_set.rules() {
1526            let id_str = rule.id().as_str();
1527            let name = rule.name();
1528            known.insert(id_str, id_str);
1529            known.insert(name, id_str);
1530        }
1531    }
1532
1533    // Walk the raw overrides; resolve each key to its canonical ID, and
1534    // track which source key contributed each canonical entry so we can
1535    // report both sides of a conflict.
1536    let raw = std::mem::take(&mut config.rules.overrides);
1537    let mut by_rule: HashMap<&'static str, (String, String)> = HashMap::new();
1538    for (key, value) in raw {
1539        match known.get(key.as_str()) {
1540            Some(&canonical_id) => {
1541                if let Some((prev_key, prev_sev)) = by_rule.get(canonical_id) {
1542                    if prev_sev != &value {
1543                        return Err(EngineConstructionError::ConflictingRuleOverride {
1544                            rule_id: canonical_id.to_owned(),
1545                            keys: Box::new([prev_key.clone(), key]),
1546                            severities: Box::new([prev_sev.clone(), value]),
1547                        });
1548                    }
1549                    // Duplicate form, same severity — accept silently.
1550                } else {
1551                    by_rule.insert(canonical_id, (key, value));
1552                }
1553            }
1554            None => {
1555                let did_you_mean = suggest_closest(&key, known.keys().copied());
1556                return Err(EngineConstructionError::UnknownRuleOverride { key, did_you_mean });
1557            }
1558        }
1559    }
1560
1561    config.rules.overrides = by_rule
1562        .into_iter()
1563        .map(|(id, (_, sev))| (id.to_owned(), sev))
1564        .collect();
1565    Ok(())
1566}
1567
/// Turn a `catch_unwind` payload into displayable text, best effort.
///
/// A panic payload is a `Box<dyn Any + Send>`. The two shapes handled
/// here are `&'static str` (returned borrowed) and `String` (returned
/// as an owned clone). Any other payload type yields the fixed
/// placeholder `"<unstringifiable panic payload>"`, so the caller
/// always receives some text.
fn panic_payload_to_string(
    payload: &Box<dyn std::any::Any + Send + 'static>,
) -> std::borrow::Cow<'static, str> {
    use std::borrow::Cow;

    payload
        .downcast_ref::<&'static str>()
        .map(|literal| Cow::Borrowed(*literal))
        .or_else(|| {
            payload
                .downcast_ref::<String>()
                .map(|formatted| Cow::Owned(formatted.clone()))
        })
        .unwrap_or(Cow::Borrowed("<unstringifiable panic payload>"))
}
1588
/// Return the closest known rule key (ID or name) to `needle` by
/// Levenshtein distance, if the closest candidate is within a small
/// edit-distance threshold. Threshold scales with `needle.len()`: short
/// strings only match on ≤ 1 edit, longer strings tolerate more.
///
/// Returns `None` when no candidate is close enough to be useful —
/// "did you mean 'REL-TO-noforn-supersession'?" for a user who typed
/// "E999" would be worse than no suggestion at all.
///
/// Ties are broken deterministically: among candidates at the same
/// (smallest) distance, the lexicographically smallest one is returned.
/// `canonicalize_rule_overrides` feeds this function `HashMap` keys,
/// whose iteration order is unspecified, so a "first seen wins"
/// tie-break would let the suggestion differ from run to run.
fn suggest_closest<'a, I>(needle: &str, candidates: I) -> Option<String>
where
    I: Iterator<Item = &'a str>,
{
    // Keep the threshold tight so we don't suggest matches that share
    // only a couple of characters:
    //   - length 0–3: 1 edit max (too short to suggest at all, really)
    //   - length 4–7: 2 edits max
    //   - length 8+:  3 edits max
    let max_distance = match needle.len() {
        0..=3 => 1,
        4..=7 => 2,
        _ => 3,
    };

    // Ordering the `(distance, candidate)` pairs makes the winner depend
    // only on the candidate *set*, never on the order it is iterated in.
    candidates
        .map(|cand| (levenshtein(needle, cand), cand))
        .filter(|&(dist, _)| dist <= max_distance)
        .min()
        .map(|(_, cand)| cand.to_owned())
}

/// Levenshtein edit distance between two byte strings. Small, inlineable,
/// no external dependency — the engine crate is on the WASM-safe surface
/// and adding a new runtime dep for a once-per-construction helper would
/// be a disproportionate trade (Constitution III).
///
/// Operates on bytes, not `char`s: rule IDs and names are ASCII by
/// construction, so the byte-level diff equals the codepoint-level diff.
fn levenshtein(a: &str, b: &str) -> usize {
    let a = a.as_bytes();
    let b = b.as_bytes();
    let (m, n) = (a.len(), b.len());
    // Against an empty string the distance is the other string's length.
    if m == 0 {
        return n;
    }
    if n == 0 {
        return m;
    }
    // Two-row DP: only the previous row is needed at any step.
    let mut prev: Vec<usize> = (0..=n).collect();
    let mut curr: Vec<usize> = vec![0; n + 1];
    for i in 1..=m {
        curr[0] = i;
        for j in 1..=n {
            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
            // deletion, insertion, substitution — cheapest wins.
            curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
        }
        std::mem::swap(&mut prev, &mut curr);
    }
    // After the final swap the last computed row lives in `prev`.
    prev[n]
}
1657
1658// ---------------------------------------------------------------------------
1659// Tests
1660// ---------------------------------------------------------------------------
1661
1662#[cfg(test)]
1663#[cfg_attr(coverage_nightly, coverage(off))]
1664mod tests {
1665    use super::*;
1666    use crate::clock::FixedClock;
1667    use marque_ism::IsmAttributes;
1668    use marque_rules::{
1669        Diagnostic, FixProposal, FixSource, Rule, RuleContext, RuleId, RuleSet, Severity,
1670    };
1671    use std::time::{Duration, UNIX_EPOCH};
1672
1673    #[test]
1674    fn heuristic_rule_axis_cap_matches_default_threshold() {
1675        // Issue #133 PR 4 invariant: the position-aware classification
1676        // heuristic's `Confidence::rule` cap is pinned at the default
1677        // `confidence_threshold` (0.95). Solo-candidate heuristic
1678        // fixes auto-apply at the default threshold; the empirical
1679        // corpus measurement (see `HEURISTIC_RULE_AXIS_CAP` doc and
1680        // `tools/corpus-analysis/output/heuristic_frequencies.json`)
1681        // justifies confidence ≥ 99.4% per-trigger, comfortably above
1682        // the cap.
1683        //
1684        // If a future change drops `HEURISTIC_RULE_AXIS_CAP` below
1685        // `Config::default().confidence_threshold()`, that's a
1686        // behavioral regression: heuristic fixes that previously auto-
1687        // applied at the default threshold would silently stop
1688        // applying, and the user-visible "fix-and-warn" surface
1689        // collapses to "warn-only-without-fix" without an explicit
1690        // intent recorded in the change.
1691        //
1692        // If a future change drops the default `confidence_threshold`
1693        // below `HEURISTIC_RULE_AXIS_CAP`, that's the inverse problem:
1694        // the heuristic suddenly becomes more aggressive than the
1695        // governance signal we agreed on. Either way, the equality
1696        // pin here forces a coordinated decision.
1697        let default_threshold = Config::default().confidence_threshold();
1698        assert!(
1699            (HEURISTIC_RULE_AXIS_CAP - default_threshold).abs() < 1e-6,
1700            "HEURISTIC_RULE_AXIS_CAP={HEURISTIC_RULE_AXIS_CAP} must equal \
1701             Config::default().confidence_threshold()={default_threshold}; \
1702             a divergence requires an intentional governance change recorded \
1703             in the cap's doc comment"
1704        );
1705    }
1706
1707    /// A test rule that emits a fixed list of FixProposals on every check call,
1708    /// ignoring the parsed attributes. Lets us drive the engine deterministically
1709    /// without depending on real CAPCO rule output.
1710    struct StubRule {
1711        id: &'static str,
1712        proposals: Vec<FixProposal>,
1713    }
1714
1715    impl Rule for StubRule {
1716        fn id(&self) -> RuleId {
1717            RuleId::new(self.id)
1718        }
1719        fn name(&self) -> &'static str {
1720            "stub"
1721        }
1722        fn default_severity(&self) -> Severity {
1723            Severity::Fix
1724        }
1725        fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
1726            self.proposals
1727                .iter()
1728                .map(|p| {
1729                    Diagnostic::new(
1730                        p.rule.clone(),
1731                        Severity::Fix,
1732                        p.span,
1733                        "stub",
1734                        "TEST",
1735                        Some(p.clone()),
1736                    )
1737                })
1738                .collect()
1739        }
1740    }
1741
1742    struct StubSet(Vec<Box<dyn Rule>>);
1743    impl RuleSet for StubSet {
1744        fn rules(&self) -> &[Box<dyn Rule>] {
1745            &self.0
1746        }
1747        fn schema_version(&self) -> &'static str {
1748            "TEST"
1749        }
1750    }
1751
1752    fn proposal(rule: &'static str, start: usize, end: usize, replacement: &str) -> FixProposal {
1753        proposal_with_confidence(rule, start, end, replacement, 1.0)
1754    }
1755
1756    fn proposal_with_confidence(
1757        rule: &'static str,
1758        start: usize,
1759        end: usize,
1760        replacement: &str,
1761        confidence: f32,
1762    ) -> FixProposal {
1763        FixProposal::new(
1764            RuleId::new(rule),
1765            FixSource::BuiltinRule,
1766            Span::new(start, end),
1767            "x",
1768            replacement,
1769            marque_rules::Confidence::strict(confidence),
1770            None,
1771        )
1772    }
1773
1774    fn engine_with(proposals: Vec<FixProposal>) -> Engine {
1775        engine_with_config(Config::default(), proposals)
1776    }
1777
1778    fn engine_with_config(config: Config, proposals: Vec<FixProposal>) -> Engine {
1779        let stub = StubRule {
1780            id: "TEST",
1781            proposals,
1782        };
1783        let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(stub)]));
1784        Engine::with_clock(
1785            config,
1786            vec![set],
1787            marque_capco::scheme::CapcoScheme::new(),
1788            Box::new(FixedClock::new(
1789                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
1790            )),
1791        )
1792        .expect("default CAPCO scheme has no rewrite cycles")
1793    }
1794
1795    /// A source long enough to span the test fix offsets, AND containing a
1796    /// banner marking so the parser produces a candidate that triggers
1797    /// the rule loop in `Engine::lint`.
1798    const TEST_SRC: &[u8] = b"SECRET//NOFORN                                                ";
1799
1800    #[test]
1801    fn fix_applies_disjoint_fixes_in_reverse_order() {
1802        // Two non-overlapping fixes; FR-016 sorts by span.end DESC so the
1803        // later one is applied first, preserving the earlier span's offsets.
1804        let engine = engine_with(vec![
1805            proposal("E001", 0, 6, "AA"),  // "SECRET" → "AA"
1806            proposal("E002", 8, 14, "BB"), // "NOFORN" → "BB"
1807        ]);
1808        let result = engine.fix(TEST_SRC, FixMode::Apply);
1809        let out = String::from_utf8(result.source).unwrap();
1810        assert!(out.starts_with("AA//BB"), "got: {out:?}");
1811        assert_eq!(result.applied.len(), 2);
1812    }
1813
1814    #[test]
1815    fn overlap_guard_drops_overlapping_fix() {
1816        // Two fixes whose spans collide. C-1: keep one, drop the other.
1817        let engine = engine_with(vec![
1818            proposal("E001", 0, 6, "AA"),
1819            proposal("E002", 3, 10, "BB"), // overlaps E001
1820        ]);
1821        let result = engine.fix(TEST_SRC, FixMode::Apply);
1822        // Exactly one fix should be applied, the other should remain in
1823        // `remaining_diagnostics` so callers can see it was not silently
1824        // dropped.
1825        assert_eq!(result.applied.len(), 1, "applied: {:?}", result.applied);
1826        assert_eq!(
1827            result.remaining_diagnostics.len(),
1828            1,
1829            "remaining: {:?}",
1830            result.remaining_diagnostics
1831        );
1832    }
1833
1834    #[test]
1835    fn dry_run_returns_original_source_but_records_applied() {
1836        let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
1837        let result = engine.fix(TEST_SRC, FixMode::DryRun);
1838        assert_eq!(result.source, TEST_SRC, "dry-run must not mutate source");
1839        assert_eq!(result.applied.len(), 1);
1840        assert!(result.applied[0].dry_run, "dry_run flag must be set");
1841    }
1842
1843    #[test]
1844    fn fix_with_threshold_rejects_nan() {
1845        let engine = engine_with(vec![]);
1846        assert!(matches!(
1847            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NAN)),
1848            Err(InvalidThreshold(_))
1849        ));
1850    }
1851
1852    #[test]
1853    fn fix_with_threshold_rejects_out_of_range() {
1854        let engine = engine_with(vec![]);
1855        assert!(matches!(
1856            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(-0.1)),
1857            Err(InvalidThreshold(_))
1858        ));
1859        assert!(matches!(
1860            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(1.1)),
1861            Err(InvalidThreshold(_))
1862        ));
1863    }
1864
1865    #[test]
1866    fn fix_with_threshold_accepts_boundaries() {
1867        let engine = engine_with(vec![]);
1868        assert!(
1869            engine
1870                .fix_with_threshold(TEST_SRC, FixMode::Apply, Some(0.0))
1871                .is_ok()
1872        );
1873        assert!(
1874            engine
1875                .fix_with_threshold(TEST_SRC, FixMode::Apply, Some(1.0))
1876                .is_ok()
1877        );
1878    }
1879
1880    #[test]
1881    fn fixed_clock_yields_deterministic_timestamps() {
1882        let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
1883        let r1 = engine.fix(TEST_SRC, FixMode::Apply);
1884        let r2 = engine.fix(TEST_SRC, FixMode::Apply);
1885        assert_eq!(r1.applied[0].timestamp, r2.applied[0].timestamp);
1886    }
1887
1888    // H-3: fix_with_threshold must reject non-finite overrides in all
1889    // directions, not just NaN. INFINITY and NEG_INFINITY are both caught
1890    // by the range check; this test pins that behavior so a future refactor
1891    // that uses e.g. `is_finite` instead of `contains + is_nan` cannot
1892    // silently regress.
1893    #[test]
1894    fn fix_with_threshold_rejects_infinity() {
1895        let engine = engine_with(vec![]);
1896        assert!(matches!(
1897            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::INFINITY)),
1898            Err(InvalidThreshold(_))
1899        ));
1900        assert!(matches!(
1901            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NEG_INFINITY)),
1902            Err(InvalidThreshold(_))
1903        ));
1904    }
1905
1906    // M-4: the confidence filter at `f.confidence.combined() >= threshold`
1907    // is on the hot path of Engine::fix. These two tests pin the `>=`
1908    // semantics so a future refactor that flips it to `>` (or vice versa)
1909    // is caught. "Confidence" here is the scalar `Confidence::combined()`
1910    // (= recognition × rule); the other axes (`region`, `runner_up_ratio`,
1911    // feature contributions) are audit-provenance metadata and do not
1912    // participate in the threshold gate.
1913    #[test]
1914    fn confidence_below_default_threshold_is_excluded() {
1915        // Config::default().confidence_threshold == 0.95. A fix at 0.94
1916        // must not be applied.
1917        let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.94)]);
1918        let result = engine.fix(TEST_SRC, FixMode::Apply);
1919        assert_eq!(result.applied.len(), 0);
1920        // The below-threshold fix is a suggestion — it survives in
1921        // remaining_diagnostics so the caller can surface it.
1922        assert_eq!(result.remaining_diagnostics.len(), 1);
1923    }
1924
1925    #[test]
1926    fn lint_rewrites_below_threshold_fix_severity_to_suggest() {
1927        // Issue #235 / #186 PR-3: the lint post-pass turns a Fix-severity
1928        // diagnostic carrying a sub-threshold proposal into a Suggest-
1929        // severity diagnostic, preserving the fix payload so the renderer
1930        // can show "did you mean?" instead of silently dropping the
1931        // candidate at the threshold gate.
1932        let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.5)]);
1933        let lint = engine.lint(TEST_SRC);
1934        assert_eq!(lint.diagnostics.len(), 1);
1935        assert_eq!(lint.diagnostics[0].severity, Severity::Suggest);
1936        assert!(
1937            lint.diagnostics[0].fix.is_some(),
1938            "the candidate fix must stay attached so the renderer can surface it"
1939        );
1940        assert_eq!(lint.suggest_count(), 1);
1941        // Confirm the engine still excludes Suggest from auto-apply.
1942        let fix_result = engine.fix(TEST_SRC, FixMode::Apply);
1943        assert_eq!(fix_result.applied.len(), 0);
1944    }
1945
1946    #[test]
1947    fn lint_does_not_rewrite_at_threshold_boundary() {
1948        // A fix at exactly the threshold (0.95) must NOT be rewritten
1949        // — it is auto-apply territory, not Suggest territory. This
1950        // pins the boundary semantics: the rewrite predicate is
1951        // strictly less-than, matching the engine's `>= threshold`
1952        // application gate.
1953        let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)]);
1954        let lint = engine.lint(TEST_SRC);
1955        assert_eq!(lint.diagnostics.len(), 1);
1956        assert_eq!(lint.diagnostics[0].severity, Severity::Fix);
1957    }
1958
1959    #[test]
1960    fn lint_post_pass_leaves_fix_severity_with_no_fix_payload_alone() {
1961        // The post-pass guard order matters: even though `Fix`-severity
1962        // diagnostics are the only ones eligible for the rewrite, a
1963        // diagnostic that doesn't carry a `FixProposal` (rare in
1964        // practice — `Fix`-severity rules normally always attach one
1965        // — but representable in the type) must be skipped by the
1966        // `let Some(fix) = d.fix.as_ref() else { continue }` arm and
1967        // keep its `Fix` severity. This pins the behavior so a future
1968        // refactor that hoists the threshold check above the fix-
1969        // presence check (and might rewrite to Suggest unconditionally)
1970        // is caught.
1971        struct FixWithoutProposalRule;
1972        impl Rule for FixWithoutProposalRule {
1973            fn id(&self) -> RuleId {
1974                RuleId::new("E997")
1975            }
1976            fn name(&self) -> &'static str {
1977                "stub-fix-no-proposal"
1978            }
1979            fn default_severity(&self) -> Severity {
1980                Severity::Fix
1981            }
1982            fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
1983                vec![Diagnostic::new(
1984                    RuleId::new("E997"),
1985                    Severity::Fix,
1986                    Span::new(0, 6),
1987                    "fix-severity diagnostic with no proposal",
1988                    "TEST",
1989                    None,
1990                )]
1991            }
1992        }
1993
1994        let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(FixWithoutProposalRule)]));
1995        let engine = Engine::with_clock(
1996            Config::default(),
1997            vec![set],
1998            marque_capco::scheme::CapcoScheme::new(),
1999            Box::new(FixedClock::new(
2000                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2001            )),
2002        )
2003        .expect("default CAPCO scheme has no rewrite cycles");
2004
2005        let lint = engine.lint(TEST_SRC);
2006        assert_eq!(lint.diagnostics.len(), 1);
2007        assert_eq!(
2008            lint.diagnostics[0].severity,
2009            Severity::Fix,
2010            "Fix-severity diagnostic with no fix payload must NOT be rewritten to Suggest",
2011        );
2012        assert!(lint.diagnostics[0].fix.is_none());
2013    }
2014
2015    #[test]
2016    fn fix_excludes_explicit_suggest_severity_from_auto_apply() {
2017        // Issue #235 / #186 PR-3: a rule that emits at Severity::Suggest
2018        // directly with confidence ≥ threshold must STILL be excluded
2019        // from auto-apply by construction. The Suggest channel is a
2020        // hard "do not apply" signal regardless of the confidence
2021        // axis. This is the explicit-Suggest invariant; the StubRule
2022        // emits Fix-severity by default so we route through a custom
2023        // rule that emits Suggest directly.
2024        struct SuggestRule;
2025        impl Rule for SuggestRule {
2026            fn id(&self) -> RuleId {
2027                RuleId::new("S999")
2028            }
2029            fn name(&self) -> &'static str {
2030                "stub-suggest"
2031            }
2032            fn default_severity(&self) -> Severity {
2033                Severity::Suggest
2034            }
2035            fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
2036                let proposal = FixProposal::new(
2037                    RuleId::new("S999"),
2038                    FixSource::BuiltinRule,
2039                    Span::new(0, 6),
2040                    "SECRET",
2041                    "TOP SECRET",
2042                    marque_rules::Confidence::strict(1.0),
2043                    None,
2044                );
2045                vec![Diagnostic::new(
2046                    RuleId::new("S999"),
2047                    Severity::Suggest,
2048                    Span::new(0, 6),
2049                    "explicit suggest with high confidence",
2050                    "TEST",
2051                    Some(proposal),
2052                )]
2053            }
2054        }
2055
2056        let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(SuggestRule)]));
2057        let engine = Engine::with_clock(
2058            Config::default(),
2059            vec![set],
2060            marque_capco::scheme::CapcoScheme::new(),
2061            Box::new(FixedClock::new(
2062                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2063            )),
2064        )
2065        .expect("default CAPCO scheme has no rewrite cycles");
2066
2067        let lint = engine.lint(TEST_SRC);
2068        assert_eq!(lint.diagnostics.len(), 1);
2069        // Severity stays Suggest (post-pass leaves explicit Suggest alone).
2070        assert_eq!(lint.diagnostics[0].severity, Severity::Suggest);
2071        // Even at confidence 1.0, a Suggest-severity fix must not auto-apply.
2072        let fix_result = engine.fix(TEST_SRC, FixMode::Apply);
2073        assert_eq!(
2074            fix_result.applied.len(),
2075            0,
2076            "explicit Suggest-severity fix must not auto-apply regardless of confidence"
2077        );
2078    }
2079
2080    #[test]
2081    fn confidence_at_default_threshold_is_included() {
2082        // A fix at exactly 0.95 must be applied (inclusive threshold).
2083        let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)]);
2084        let result = engine.fix(TEST_SRC, FixMode::Apply);
2085        assert_eq!(result.applied.len(), 1);
2086    }
2087
2088    // M-5: the zero-length-span filter (`!f.span.is_empty()`) in fix_inner
2089    // is what masked the Phase 2 Span::new(0, 0) placeholders from the
2090    // C-1 overlap guard. This test pins that guard explicitly so a future
2091    // refactor that drops the filter is caught.
2092    #[test]
2093    fn zero_length_span_fix_is_filtered_before_sort() {
2094        let engine = engine_with(vec![proposal("E001", 5, 5, "X")]);
2095        let result = engine.fix(TEST_SRC, FixMode::Apply);
2096        assert_eq!(result.applied.len(), 0);
2097        // Source unchanged: no splice was attempted.
2098        assert_eq!(result.source, TEST_SRC);
2099    }
2100
2101    // L-4: all the other threshold tests go through fix_with_threshold
2102    // (override path). This exercises the Config-supplied path explicitly
2103    // so both branches of `fix_with_threshold_inner`'s threshold selection
2104    // are covered.
2105    #[test]
2106    fn config_supplied_threshold_filters_proposals() {
2107        let mut config = Config::default();
2108        config.set_confidence_threshold(0.5).unwrap();
2109        let engine = engine_with_config(
2110            config,
2111            vec![
2112                proposal_with_confidence("E001", 0, 6, "AA", 0.4), // below
2113                proposal_with_confidence("E002", 8, 14, "BB", 0.6), // above
2114            ],
2115        );
2116        let result = engine.fix(TEST_SRC, FixMode::Apply);
2117        // Only the 0.6 fix is applied.
2118        assert_eq!(result.applied.len(), 1);
2119        assert_eq!(result.applied[0].proposal.rule.as_str(), "E002");
2120        // The 0.4 fix surfaces as a remaining diagnostic.
2121        assert_eq!(result.remaining_diagnostics.len(), 1);
2122    }
2123
2124    // Phase 3 Task 2: PageBreak candidates must reset the engine's
2125    // PageContext accumulator. Without this, banner-validation rules on
2126    // the second page would see portions from the first page, producing
2127    // over-restrictive expected aggregates.
2128    #[test]
2129    fn lint_handles_multi_page_document_with_form_feed() {
2130        let src: &[u8] = b"(SECRET//NOFORN) page 1 body.\nSECRET//NOFORN\n\x0c(CONFIDENTIAL) page 2 body.\nCONFIDENTIAL\n";
2131        let engine = engine_with(vec![]);
2132        let result = engine.lint(src);
2133        // Stub rule with no proposals: clean lint, no panic, no parser
2134        // error from the page-break candidate (which is filtered before
2135        // parser.parse is called).
2136        assert!(result.is_clean());
2137    }
2138
2139    // F.1: PageContext reset semantics are observable.
2140    //
2141    // ContextRecorderRule captures the live `page_context.portion_count()`
2142    // every time it's invoked. By running the engine over a multi-page
2143    // document and inspecting the captured counts at each banner candidate,
2144    // we prove that the engine resets PageContext at the page break instead
2145    // of accumulating across pages.
2146    #[derive(Clone)]
2147    struct ContextRecorderRule {
2148        observations: std::sync::Arc<std::sync::Mutex<Vec<(marque_ism::MarkingType, usize)>>>,
2149    }
2150
2151    impl Rule for ContextRecorderRule {
2152        fn id(&self) -> RuleId {
2153            RuleId::new("RECORD")
2154        }
2155        fn name(&self) -> &'static str {
2156            "page-context-recorder"
2157        }
2158        fn default_severity(&self) -> Severity {
2159            Severity::Warn
2160        }
2161        fn check(&self, _attrs: &IsmAttributes, ctx: &RuleContext) -> Vec<Diagnostic> {
2162            let count = ctx
2163                .page_context
2164                .as_ref()
2165                .map(|pc| pc.portion_count())
2166                .unwrap_or(0);
2167            self.observations
2168                .lock()
2169                .unwrap()
2170                .push((ctx.marking_type, count));
2171            vec![]
2172        }
2173    }
2174
2175    struct RecorderSet(Vec<Box<dyn Rule>>);
2176    impl RuleSet for RecorderSet {
2177        fn rules(&self) -> &[Box<dyn Rule>] {
2178            &self.0
2179        }
2180        fn schema_version(&self) -> &'static str {
2181            "TEST"
2182        }
2183    }
2184
2185    #[test]
2186    fn page_context_resets_observably_across_form_feed() {
2187        use marque_ism::MarkingType;
2188        let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
2189        let rule = ContextRecorderRule {
2190            observations: std::sync::Arc::clone(&observations),
2191        };
2192        let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![Box::new(rule)]));
2193        let engine = Engine::with_clock(
2194            Config::default(),
2195            vec![set],
2196            marque_capco::scheme::CapcoScheme::new(),
2197            Box::new(FixedClock::new(
2198                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2199            )),
2200        )
2201        .expect("default CAPCO scheme has no rewrite cycles");
2202
2203        // Two pages, separated by a form feed:
2204        //   Page 1: one portion + one banner
2205        //   Page break (\f)
2206        //   Page 2: one portion + one banner
2207        //
2208        // The recorder fires on every candidate that reaches the rule loop.
2209        // For the page-1 banner we expect to see 1 accumulated portion.
2210        // For the page-2 banner we expect to see 1 accumulated portion
2211        // (NOT 2) — the form feed must have reset the context.
2212        let src: &[u8] = b"(SECRET//NF) p1 text\nSECRET//NOFORN\n\x0c(CONFIDENTIAL//NF) p2\nCONFIDENTIAL//NOFORN\n";
2213        let _ = engine.lint(src);
2214
2215        let obs = observations.lock().unwrap();
2216        // The recorder ran once per non-PageBreak candidate. Filter to
2217        // banners and check the page_context count each banner saw.
2218        let banner_counts: Vec<usize> = obs
2219            .iter()
2220            .filter(|(kind, _)| *kind == MarkingType::Banner)
2221            .map(|(_, count)| *count)
2222            .collect();
2223        assert_eq!(
2224            banner_counts.len(),
2225            2,
2226            "expected 2 banner observations, got: {obs:?}"
2227        );
2228        assert_eq!(
2229            banner_counts[0], 1,
2230            "page-1 banner should see 1 accumulated portion"
2231        );
2232        assert_eq!(
2233            banner_counts[1], 1,
2234            "page-2 banner should see 1 accumulated portion (the page-1 \
2235             portion must be cleared by the form feed)"
2236        );
2237    }
2238
2239    #[test]
2240    fn page_context_lint_starts_fresh_on_each_call() {
2241        // Calling Engine::lint twice on the same engine must produce a
2242        // fresh PageContext for the second call — no cross-call accumulation.
2243        use marque_ism::MarkingType;
2244        let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
2245        let rule = ContextRecorderRule {
2246            observations: std::sync::Arc::clone(&observations),
2247        };
2248        let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![Box::new(rule)]));
2249        let engine = Engine::with_clock(
2250            Config::default(),
2251            vec![set],
2252            marque_capco::scheme::CapcoScheme::new(),
2253            Box::new(FixedClock::new(
2254                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2255            )),
2256        )
2257        .expect("default CAPCO scheme has no rewrite cycles");
2258        let src: &[u8] = b"(SECRET//NF) text\nSECRET//NOFORN\n";
2259        let _ = engine.lint(src);
2260        let _ = engine.lint(src);
2261
2262        let obs = observations.lock().unwrap();
2263        // Both calls should see identical observations — if the second
2264        // call leaked state from the first, the page-2 banner_count would
2265        // double.
2266        let banner_counts: Vec<usize> = obs
2267            .iter()
2268            .filter(|(kind, _)| *kind == MarkingType::Banner)
2269            .map(|(_, count)| *count)
2270            .collect();
2271        assert_eq!(
2272            banner_counts.len(),
2273            2,
2274            "two lint calls should produce two banner observations"
2275        );
2276        assert_eq!(banner_counts, vec![1, 1]);
2277    }
2278
2279    // M6: FR-016 tiebreaker — same span, different rule IDs.
2280    // The sort is (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
2281    // When two fixes target the exact same span, rule_id ASC breaks the tie,
2282    // and C-1 drops the second (overlapping) fix.
2283    #[test]
2284    fn fr016_same_span_different_rule_ids_picks_lower_rule_id() {
2285        // Two proposals for span 0..6 with different rule IDs.
2286        // "C001" < "E001" lexicographically, so C001 is kept and E001 dropped.
2287        let engine = engine_with(vec![
2288            proposal("E001", 0, 6, "BB"),
2289            proposal("C001", 0, 6, "AA"),
2290        ]);
2291        let result = engine.fix(TEST_SRC, FixMode::Apply);
2292        assert_eq!(result.applied.len(), 1);
2293        assert_eq!(result.applied[0].proposal.rule.as_str(), "C001");
2294        assert_eq!(result.applied[0].proposal.replacement.as_ref(), "AA");
2295    }
2296
2297    // FR-016 tiebreaker — same span, same rule ID, different replacements.
2298    #[test]
2299    fn fr016_same_span_same_rule_picks_lower_replacement() {
2300        let engine = engine_with(vec![
2301            proposal("E001", 0, 6, "ZZZ"),
2302            proposal("E001", 0, 6, "AAA"),
2303        ]);
2304        let result = engine.fix(TEST_SRC, FixMode::Apply);
2305        assert_eq!(result.applied.len(), 1);
2306        assert_eq!(result.applied[0].proposal.replacement.as_ref(), "AAA");
2307    }
2308
2309    // -----------------------------------------------------------------------
2310    // Task #49 — rule-alias canonicalization + fail-loud on unknown keys
2311    // -----------------------------------------------------------------------
2312
2313    /// Stub rule with distinct, test-controlled id and name so we can
2314    /// exercise the alias-resolution logic. The base `StubRule` hardcodes
2315    /// `name() -> "stub"`, which collides across multiple rules and
2316    /// doesn't model real CAPCO rules.
2317    struct NamedStub {
2318        id: &'static str,
2319        name: &'static str,
2320    }
2321
2322    impl Rule for NamedStub {
2323        fn id(&self) -> RuleId {
2324            RuleId::new(self.id)
2325        }
2326        fn name(&self) -> &'static str {
2327            self.name
2328        }
2329        fn default_severity(&self) -> Severity {
2330            Severity::Warn
2331        }
2332        fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
2333            vec![]
2334        }
2335    }
2336
2337    fn named_rule_set(rules: &[(&'static str, &'static str)]) -> Box<dyn RuleSet> {
2338        let rules: Vec<Box<dyn Rule>> = rules
2339            .iter()
2340            .map(|(id, name)| Box::new(NamedStub { id, name }) as Box<dyn Rule>)
2341            .collect();
2342        Box::new(StubSet(rules))
2343    }
2344
2345    fn config_with_overrides(pairs: &[(&str, &str)]) -> Config {
2346        let mut config = Config::default();
2347        for (k, v) in pairs {
2348            config
2349                .rules
2350                .overrides
2351                .insert((*k).to_owned(), (*v).to_owned());
2352        }
2353        config
2354    }
2355
2356    #[test]
2357    fn canonicalize_accepts_rule_id_form_unchanged() {
2358        let mut config = config_with_overrides(&[("E001", "warn")]);
2359        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2360        canonicalize_rule_overrides(&mut config, &sets).expect("should succeed");
2361        assert_eq!(
2362            config.rules.overrides.get("E001"),
2363            Some(&"warn".to_owned()),
2364            "ID-form override keeps its key"
2365        );
2366    }
2367
2368    #[test]
2369    fn canonicalize_accepts_rule_name_form_and_resolves_to_id() {
2370        let mut config = config_with_overrides(&[("portion-mark-in-banner", "error")]);
2371        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2372        canonicalize_rule_overrides(&mut config, &sets).expect("should succeed");
2373        assert_eq!(
2374            config.rules.overrides.get("E001"),
2375            Some(&"error".to_owned()),
2376            "name-form override resolves to canonical ID"
2377        );
2378        assert!(
2379            !config
2380                .rules
2381                .overrides
2382                .contains_key("portion-mark-in-banner"),
2383            "pre-canonicalization name key must not survive"
2384        );
2385    }
2386
2387    #[test]
2388    fn canonicalize_rejects_unknown_key_with_suggestion_for_near_miss() {
2389        let mut config = config_with_overrides(&[("E00l", "warn")]); // lowercase-L, not 1
2390        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2391        let err = canonicalize_rule_overrides(&mut config, &sets).unwrap_err();
2392        match err {
2393            EngineConstructionError::UnknownRuleOverride { key, did_you_mean } => {
2394                assert_eq!(key, "E00l");
2395                assert_eq!(
2396                    did_you_mean.as_deref(),
2397                    Some("E001"),
2398                    "single-character typo should suggest the canonical ID"
2399                );
2400            }
2401            other => panic!("expected UnknownRuleOverride, got {other:?}"),
2402        }
2403    }
2404
2405    #[test]
2406    fn canonicalize_rejects_unknown_key_without_suggestion_when_nothing_close() {
2407        // No candidate is within edit distance 3, so did_you_mean must be None
2408        // — a nonsense suggestion is worse than no suggestion.
2409        let mut config = config_with_overrides(&[("totally-made-up-rule-name", "error")]);
2410        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2411        let err = canonicalize_rule_overrides(&mut config, &sets).unwrap_err();
2412        match err {
2413            EngineConstructionError::UnknownRuleOverride { key, did_you_mean } => {
2414                assert_eq!(key, "totally-made-up-rule-name");
2415                assert!(
2416                    did_you_mean.is_none(),
2417                    "distant misses must not emit a suggestion; got {did_you_mean:?}"
2418                );
2419            }
2420            other => panic!("expected UnknownRuleOverride, got {other:?}"),
2421        }
2422    }
2423
2424    #[test]
2425    fn canonicalize_rejects_conflicting_id_and_name_forms_with_different_severity() {
2426        let mut config =
2427            config_with_overrides(&[("E001", "warn"), ("portion-mark-in-banner", "error")]);
2428        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2429        let err = canonicalize_rule_overrides(&mut config, &sets).unwrap_err();
2430        match err {
2431            EngineConstructionError::ConflictingRuleOverride {
2432                rule_id,
2433                keys,
2434                severities,
2435            } => {
2436                assert_eq!(rule_id, "E001");
2437                // HashMap iteration order isn't deterministic — verify by set.
2438                let k: std::collections::HashSet<&str> = keys.iter().map(|s| s.as_str()).collect();
2439                assert!(k.contains("E001"));
2440                assert!(k.contains("portion-mark-in-banner"));
2441                let s: std::collections::HashSet<&str> =
2442                    severities.iter().map(|s| s.as_str()).collect();
2443                assert!(s.contains("warn"));
2444                assert!(s.contains("error"));
2445            }
2446            other => panic!("expected ConflictingRuleOverride, got {other:?}"),
2447        }
2448    }
2449
2450    #[test]
2451    fn canonicalize_accepts_duplicate_forms_with_same_severity() {
2452        // A user who writes both `E001 = "warn"` and `portion-mark-in-banner
2453        // = "warn"` (e.g., via copy-paste across layers) is unambiguous and
2454        // should not be punished.
2455        let mut config =
2456            config_with_overrides(&[("E001", "warn"), ("portion-mark-in-banner", "warn")]);
2457        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2458        canonicalize_rule_overrides(&mut config, &sets)
2459            .expect("duplicate forms with same severity must succeed");
2460        assert_eq!(config.rules.overrides.len(), 1);
2461        assert_eq!(config.rules.overrides.get("E001"), Some(&"warn".to_owned()));
2462    }
2463
2464    #[test]
2465    fn canonicalize_accepts_overrides_across_multiple_rule_sets() {
2466        // Two rule sets registered; aliases from each must resolve.
2467        let mut config = config_with_overrides(&[
2468            ("portion-mark-in-banner", "error"), // name from set A
2469            ("M500", "warn"),                    // ID from set B
2470        ]);
2471        let sets = vec![
2472            named_rule_set(&[("E001", "portion-mark-in-banner")]),
2473            named_rule_set(&[("M500", "some-other-domain-rule")]),
2474        ];
2475        canonicalize_rule_overrides(&mut config, &sets).expect("should succeed");
2476        assert_eq!(
2477            config.rules.overrides.get("E001"),
2478            Some(&"error".to_owned())
2479        );
2480        assert_eq!(config.rules.overrides.get("M500"), Some(&"warn".to_owned()));
2481    }
2482
2483    #[test]
2484    fn canonicalize_empty_overrides_is_noop() {
2485        let mut config = Config::default();
2486        let sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
2487        canonicalize_rule_overrides(&mut config, &sets).expect("empty overrides must succeed");
2488        assert!(config.rules.overrides.is_empty());
2489    }
2490
2491    #[test]
2492    fn unknown_rule_override_exit_code_is_dataerr() {
2493        let err = EngineConstructionError::UnknownRuleOverride {
2494            key: "E999".into(),
2495            did_you_mean: None,
2496        };
2497        assert_eq!(err.exit_code(), 65, "EX_DATAERR for user-config errors");
2498    }
2499
2500    #[test]
2501    fn conflicting_rule_override_exit_code_is_dataerr() {
2502        let err = EngineConstructionError::ConflictingRuleOverride {
2503            rule_id: "E001".into(),
2504            keys: Box::new(["E001".into(), "portion-mark-in-banner".into()]),
2505            severities: Box::new(["warn".into(), "error".into()]),
2506        };
2507        assert_eq!(err.exit_code(), 65);
2508    }
2509
2510    #[test]
2511    fn rewrite_cycle_exit_code_is_unavailable() {
2512        // Scheme defects (not user-config errors) stay on EX_UNAVAILABLE.
2513        use marque_scheme::CategoryId;
2514        let err = EngineConstructionError::RewriteCycle {
2515            axis: CategoryId(0),
2516            members: Box::new(["a", "b"]),
2517        };
2518        assert_eq!(err.exit_code(), 69);
2519    }
2520
2521    #[test]
2522    fn levenshtein_matches_reference_values() {
2523        // Spot-check against hand-computed distances to catch regressions
2524        // in the DP implementation.
2525        assert_eq!(super::levenshtein("", ""), 0);
2526        assert_eq!(super::levenshtein("E001", "E001"), 0);
2527        assert_eq!(super::levenshtein("E001", "E002"), 1);
2528        assert_eq!(super::levenshtein("E001", "E00l"), 1);
2529        assert_eq!(super::levenshtein("kitten", "sitting"), 3);
2530        assert_eq!(super::levenshtein("", "abc"), 3);
2531        assert_eq!(super::levenshtein("abc", ""), 3);
2532    }
2533
2534    #[test]
2535    fn suggest_closest_prefers_smaller_distance() {
2536        let cands = ["E001", "E002", "E010"];
2537        // "E00l" has dist 1 to E001 and dist 1 to E002 (single substitution),
2538        // and dist 2 to E010. E001 should win the tie-break because it appears
2539        // first among the equally close candidates.
2540        assert_eq!(
2541            super::suggest_closest("E00l", cands.iter().copied()),
2542            Some("E001".to_owned())
2543        );
2544    }
2545
2546    #[test]
2547    fn suggest_closest_returns_none_when_nothing_is_close_enough() {
2548        let cands = ["portion-mark-in-banner", "missing-usa-trigraph"];
2549        // Very short needle with no near neighbors — threshold is 1 for
2550        // length 3, and the closest candidate is many edits away.
2551        assert!(super::suggest_closest("xyz", cands.iter().copied()).is_none());
2552    }
2553}
marque_engine/engine.rs

marque_engine/
engine.rs