marque_engine/engine.rs
1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! `Engine` — the configured, ready-to-run pipeline.
6
7use crate::clock::{Clock, SystemClock};
8use crate::errors::{EngineConstructionError, EngineError};
9use crate::options::{FixOptions, LintOptions};
10use crate::output::{FixResult, LintResult};
11use crate::recognizer::shift_token_spans;
12use crate::scheduler::schedule_rewrites;
13use aho_corasick::AhoCorasick;
14use marque_capco::CapcoScheme;
15use marque_capco::provenance::DecoderProvenance;
16use marque_config::Config;
17use marque_ism::Span;
18use marque_rules::{
19 AppliedFix, CORRECTIONS_MAP_CITATION, Confidence, Diagnostic, EnginePromotionToken,
20 FixProposal, FixSource, RuleId, RuleSet, Severity,
21};
22use marque_scheme::ambiguity::Parsed;
23use marque_scheme::recognizer::{ParseContext, Recognizer};
24use marque_scheme::{MarkingScheme, RewriteId};
25use std::collections::HashMap;
26use std::panic::AssertUnwindSafe;
27use std::sync::Arc;
28// See note in `options.rs` — `web_time::Instant` is `std::time::Instant`
29// on native and a Performance.now() polyfill on wasm32-unknown-unknown.
30use web_time::Instant;
31
/// Cooperative-cancellation predicate (spec 005 §R3).
///
/// Every deadline check site (`lint_with_options` pre-pass,
/// per-candidate, `fix_inner` post-lint, per-fix-application) goes
/// through this helper so the wall-clock comparison stays consistent.
///
/// Returns `true` only when a deadline is present and `now >= deadline`;
/// a deadline equal to the current `Instant` therefore counts as
/// expired (the spec's "expired" semantics). `None` never expires.
#[inline]
fn deadline_expired(deadline: Option<Instant>) -> bool {
    match deadline {
        // The clock is sampled only when a deadline is actually set.
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
42
/// Synthetic rule identifier the engine attaches to decoder-path
/// `FixSource::DecoderPosterior` diagnostics emitted from
/// `Engine::lint`.
///
/// Phase 4 PR-4b mints this identifier so the recognition-layer
/// rewrite carries a real `RuleId` (rules and fixes share that
/// requirement) without colliding with any CAPCO
/// `E### / W### / C### / S###` namespace. A diagnostic stamped
/// `R001` originates from the decoder, not from a CAPCO rule.
const DECODER_RULE_ID: &str = "R001";
51
/// Citation attached to `R001 decoder-recognition` diagnostics.
///
/// Points at CAPCO-2016 §A.6 — the canonical-marking-form section the
/// decoder is enforcing. Per Constitution VIII the citation is
/// verifiable: §A.6 is "(U) Formatting" beginning on page 15 (table of
/// contents, `crates/capco/docs/CAPCO-2016.md` line 49) and contains
/// the canonical syntax for portion / banner / CAB markings the
/// decoder canonicalizes input toward.
const DECODER_CITATION: &str = "CAPCO-2016 §A.6 p15";
60
/// Whether to apply fixes or just simulate (dry-run).
///
/// Passed as the `mode` argument of [`Engine::fix`] and
/// [`Engine::fix_with_threshold`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FixMode {
    /// Apply fixes to the source text.
    Apply,
    /// Simulate fixes — audit stream is identical but source is unchanged.
    DryRun,
}
69
/// Error produced when a caller-supplied runtime confidence threshold
/// override falls outside the valid `[0.0, 1.0]` range.
///
/// The wrapped `f32` is the rejected value exactly as the caller
/// passed it; the `Display` message covers the NaN case as well.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct InvalidThreshold(pub f32);

impl std::fmt::Display for InvalidThreshold {
    /// Renders the rejected value inside a fixed explanatory sentence.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(value) = self;
        write!(
            f,
            "confidence threshold {value} is outside [0.0, 1.0] or is NaN"
        )
    }
}

// No underlying cause to expose: the default `Error` methods suffice.
impl std::error::Error for InvalidThreshold {}
86
87/// A configured engine instance.
88pub struct Engine {
89 config: Config,
90 rule_sets: Vec<Box<dyn RuleSet>>,
91 clock: Box<dyn Clock>,
92 /// Corrections map wrapped in Arc once at construction time so that each
93 /// `RuleContext` clone in `lint()` is an O(1) refcount bump, not a
94 /// deep-clone of the entire HashMap.
95 corrections_arc: Option<Arc<HashMap<String, String>>>,
96 /// Pre-built Aho-Corasick automaton for pre-scanner text corrections.
97 /// Built once at construction time from the corrections map (excluding
98 /// no-op and "//" entries). `None` when the corrections map is empty or
99 /// all entries are filtered out.
100 corrections_ac: Option<CachedAhoCorasick>,
101 /// Topologically-sorted rewrite ids, computed once at construction
102 /// time from the scheme's `page_rewrites()` declaration. The order
103 /// satisfies: for every edge `a → b` (rewrite `a` writes a
104 /// category `b` reads), `a` appears before `b`. When dataflow
105 /// edges fully determine the order, FR-007's declaration-order-
106 /// independence guarantee holds; when two rewrites have no edge
107 /// between them, the scheduler breaks the tie by declaration
108 /// order (Kahn's algorithm seeded in declaration order). Empty
109 /// when the scheme declares no rewrites.
110 scheduled_rewrites: Box<[RewriteId]>,
111 /// Recognizer used by `lint()` to resolve each scanner candidate to
112 /// an `IsmAttributes`. Held behind `Arc<dyn Recognizer>` so callers
113 /// can override the default via [`Engine::with_recognizer`] without
114 /// touching the lint loop. Shared across threads unchanged — the
115 /// recognizer trait is `Send + Sync` and `BatchEngine` workers hold
116 /// the same `Arc` reference (Constitution VI, FR-023).
117 ///
118 /// Default: [`StrictOrDecoderRecognizer`] — strict-first dispatch
119 /// with a decoder fallback on strict-parse zero-candidate. The
120 /// decoder recovers mangled markings that are edit-distance-1/2,
121 /// token-reordered, superseded, or case-mangled from a real
122 /// CAPCO-2016 marking. Live-typing surfaces concerned with
123 /// per-keystroke latency are expected to debounce their calls into
124 /// the engine; surfaces that need to pin strict-only behavior (the
125 /// SC-001 interactive-latency benchmark, tests asserting strict
126 /// dispatch) install [`StrictRecognizer`] explicitly via
127 /// [`Engine::with_recognizer`].
128 recognizer: Arc<dyn Recognizer<CapcoScheme>>,
129
130 /// CLI-supplied corpus override (Phase 4 PR-5 / FR-013 / T069).
131 /// Held only behind the `corpus-override` Cargo feature so the
132 /// WASM artifact and the `marque-server` build cannot
133 /// accidentally accept one through any code path.
134 ///
135 /// The decoder does not yet substitute these priors into scoring
136 /// — PR-5 minimal scope wires the surface end-to-end and stamps
137 /// every decoder fix with
138 /// [`marque_rules::FeatureId::CorpusOverrideInEffect`] in the
139 /// audit record so an auditor can identify fixes produced under
140 /// organizational overrides vs. stock priors. The prior-
141 /// substitution wiring is the next-PR step; this field is the
142 /// seam.
143 #[cfg(feature = "corpus-override")]
144 corpus_override: Option<std::sync::Arc<marque_config::corpus_override::CorpusOverride>>,
145}
146
/// Cached AhoCorasick automaton + the active (key, value) pairs that
/// correspond to its pattern indices.
///
/// Built once in `Engine::with_clock` and read by the raw-text
/// corrections pass in `Engine::lint_with_options`.
struct CachedAhoCorasick {
    /// Automaton compiled from the keys of `active`, in the same order.
    ac: AhoCorasick,
    /// Active correction pairs, indexed by `PatternID::as_usize()`.
    active: Vec<(Box<str>, Box<str>)>,
}
154
155impl Engine {
156 /// Create a new engine with the given configuration, rule sets, and
157 /// marking scheme.
158 ///
159 /// Runs the page-rewrite scheduler (Kahn's algorithm over the
160 /// scheme's declared `reads` / `writes` axes) once at construction
161 /// time. Cycles and unannotated `Custom` rewrites fail closed with
162 /// [`EngineConstructionError`] rather than degrading at lint time.
163 ///
164 /// Use [`Engine::with_clock`] for deterministic-timestamp testing.
165 pub fn new<S: MarkingScheme>(
166 config: Config,
167 rule_sets: Vec<Box<dyn RuleSet>>,
168 scheme: S,
169 ) -> Result<Self, EngineConstructionError> {
170 Self::with_clock(config, rule_sets, scheme, Box::new(SystemClock))
171 }
172
173 /// Create an engine with a custom clock (for deterministic tests).
174 pub fn with_clock<S: MarkingScheme>(
175 mut config: Config,
176 rule_sets: Vec<Box<dyn RuleSet>>,
177 scheme: S,
178 clock: Box<dyn Clock>,
179 ) -> Result<Self, EngineConstructionError> {
180 // Canonicalize [rules] overrides against the registered rule
181 // set: accept either the rule ID (e.g. "E001") or the rule
182 // name (e.g. "portion-mark-in-banner"), resolve both to the
183 // canonical ID before the engine stores the map, and hard-fail
184 // on any unknown key. See `canonicalize_rule_overrides`.
185 canonicalize_rule_overrides(&mut config, &rule_sets)?;
186
187 let scheduled_rewrites = schedule_rewrites(scheme.page_rewrites())?;
188 // Take ownership of the corrections map instead of cloning —
189 // nothing reads config.corrections after construction.
190 let corrections_arc = if config.corrections.is_empty() {
191 None
192 } else {
193 Some(Arc::new(std::mem::take(&mut config.corrections)))
194 };
195
196 // Pre-build the AhoCorasick automaton for pre-scanner text corrections.
197 // This is O(total pattern bytes) and done once, not per-lint call.
198 let corrections_ac = corrections_arc.as_ref().and_then(|corrections| {
199 // Sort by key for deterministic pattern ordering — HashMap
200 // iteration order is random (hash seed varies per process),
201 // and AhoCorasick pattern IDs depend on insertion order.
202 let mut active: Vec<(Box<str>, Box<str>)> = corrections
203 .iter()
204 .filter(|(k, v)| k != v && k.as_str() != "//")
205 .map(|(k, v)| (k.as_str().into(), v.as_str().into()))
206 .collect();
207 active.sort_by(|(a, _), (b, _)| a.cmp(b));
208 if active.is_empty() {
209 return None;
210 }
211 let patterns: Vec<&str> = active.iter().map(|(k, _)| k.as_ref()).collect();
212 match AhoCorasick::new(&patterns) {
213 Ok(ac) => Some(CachedAhoCorasick { ac, active }),
214 Err(e) => {
215 tracing::warn!(
216 "failed to build AhoCorasick automaton for corrections map \
217 ({} patterns): {e}; pre-scanner text corrections disabled",
218 patterns.len()
219 );
220 None
221 }
222 }
223 });
224
225 Ok(Self {
226 config,
227 rule_sets,
228 clock,
229 corrections_arc,
230 corrections_ac,
231 scheduled_rewrites,
232 recognizer: Arc::new(crate::decoder::StrictOrDecoderRecognizer::new()),
233 #[cfg(feature = "corpus-override")]
234 corpus_override: None,
235 })
236 }
237
238 /// The topologically-sorted rewrite order computed by the scheduler
239 /// at construction time.
240 ///
241 /// Exposed for diagnostic / test inspection. Per-document lint does
242 /// not re-sort; this slice is the canonical order every page roll-up
243 /// walks.
244 pub fn scheduled_rewrites(&self) -> &[RewriteId] {
245 &self.scheduled_rewrites
246 }
247
248 /// Override the engine's recognizer. The default installed by
249 /// [`Engine::new`] is [`StrictOrDecoderRecognizer`] (strict-first,
250 /// decoder fallback). Callers that need to pin a different dispatch
251 /// — most commonly [`StrictRecognizer`] for the SC-001 interactive-
252 /// latency benchmark or tests asserting strict-only behavior —
253 /// install one explicitly here.
254 ///
255 /// Returns the engine by value so callers can chain:
256 ///
257 /// ```ignore
258 /// let engine = Engine::new(config, rules, scheme)?
259 /// .with_recognizer(Arc::new(StrictRecognizer::new()));
260 /// ```
261 #[must_use = "with_recognizer returns a new Engine; the returned value must be bound for the override to take effect"]
262 pub fn with_recognizer(mut self, recognizer: Arc<dyn Recognizer<CapcoScheme>>) -> Self {
263 self.recognizer = recognizer;
264 self
265 }
266
/// Install a CLI-supplied corpus override, replacing any override
/// installed earlier.
///
/// Compiled only under the `corpus-override` Cargo feature (CLI-only
/// — `marque-server` rejects override input on every channel per
/// T066, and the WASM crate cannot enable the feature at all per
/// T067).
///
/// Phase 4 PR-5 minimal scope: the engine retains the override
/// for audit-annotation purposes only. Every subsequent decoder-
/// path fix produced by [`Engine::lint`] gets a
/// [`FeatureId::CorpusOverrideInEffect`] feature contribution
/// appended to its `Confidence.features` so an auditor can
/// identify fixes produced under organizational overrides vs.
/// stock priors. Substituting the override priors into the
/// decoder's prior-table lookup is the next-PR step.
#[cfg(feature = "corpus-override")]
#[must_use = "with_corpus_override returns a new Engine; the result must be bound to take effect — `engine.with_corpus_override(o)` alone leaves the engine without an override installed"]
pub fn with_corpus_override(
    self,
    override_data: Arc<marque_config::corpus_override::CorpusOverride>,
) -> Self {
    let mut engine = self;
    engine.corpus_override = Some(override_data);
    engine
}
290
291 /// Whether a corpus override is in effect for this engine.
292 ///
293 /// Returns `false` unconditionally when the `corpus-override`
294 /// Cargo feature is not compiled in — the WASM and server
295 /// builds therefore cannot observe a `true` here regardless of
296 /// what any caller passes through other surfaces. Callers that
297 /// need to thread the flag into audit-record construction (the
298 /// private `build_decoder_diagnostic` helper inside this module)
299 /// should go through this method rather than poking at the
300 /// field directly.
301 #[inline]
302 pub fn corpus_override_active(&self) -> bool {
303 #[cfg(feature = "corpus-override")]
304 {
305 self.corpus_override.is_some()
306 }
307 #[cfg(not(feature = "corpus-override"))]
308 {
309 false
310 }
311 }
312
313 /// Lint a UTF-8 text buffer. Returns diagnostics without modifying input.
314 ///
315 /// Back-compat shim over [`Engine::lint_with_options`] — calling
316 /// `lint(src)` is equivalent to
317 /// `lint_with_options(src, &LintOptions::default())`. New code that
318 /// needs a deadline (spec 005 §R3) should call the `_with_options`
319 /// variant directly.
320 pub fn lint(&self, source: &[u8]) -> LintResult {
321 self.lint_with_options(source, &LintOptions::default())
322 }
323
324 /// Lint with per-call options (spec 005 §R2).
325 ///
326 /// Phase 2 honors `opts.deadline` via cooperative cancellation
327 /// (spec §R3): a pre-pass check returns immediately on an
328 /// already-expired deadline, and a per-candidate check inside
329 /// the rule loop breaks out as soon as the deadline passes. The
330 /// returned `LintResult` carries `truncated: bool` together with
331 /// `candidates_processed` / `candidates_total` so the caller can
332 /// distinguish a complete pass from a deadline-bounded partial
333 /// pass.
334 ///
335 /// Granularity: the engine checks the deadline at candidate
336 /// boundaries (between scanner-emitted candidates), not inside
337 /// any individual rule's `check`. A pathologically slow rule
338 /// running on one large candidate can therefore overrun the
339 /// deadline by the time that one rule takes; this is the spec
340 /// §R3 trade-off — a finer-grained check inside `Rule::check`
341 /// would require a deadline-aware rule trait.
342 pub fn lint_with_options(&self, source: &[u8], opts: &LintOptions) -> LintResult {
343 use marque_core::Scanner;
344 use marque_ism::{MarkingType, PageContext};
345 use marque_rules::RuleContext;
346
347 // T007: pre-pass deadline check. An already-expired deadline
348 // returns a fully-truncated empty result before the scanner
349 // runs at all, preserving the spec invariant that the
350 // expired path is observable in zero work.
351 if deadline_expired(opts.deadline) {
352 return LintResult {
353 truncated: true,
354 ..Default::default()
355 };
356 }
357
358 let candidates = Scanner::scan(source);
359 // T009: candidates_total is fixed once the scanner has
360 // produced the candidate stream. It is independent of how
361 // many candidates the rule loop ultimately processes — the
362 // delta against `candidates_processed` is what makes
363 // truncation observable to the caller (R3). On a complete
364 // pass these are equal; on a deadline-bounded pass the
365 // function returns early from inside the loop with the
366 // partial `candidates_processed`, so the post-loop
367 // `LintResult` construction below is reached ONLY on
368 // non-truncated completion.
369 let candidates_total = candidates.len();
370 let mut candidates_processed: usize = 0;
371
372 // corrections_arc was built once at Engine construction; each clone here
373 // is an O(1) refcount bump.
374 let corrections_arc = self.corrections_arc.clone();
375
376 let mut diagnostics = Vec::new();
377 // Build page context by accumulating portion markings in document order.
378 // Banner and CAB rules receive this context so they can validate the
379 // observed banner against the expected composite. Phase 3 wires the
380 // page-break reset below — the scanner emits a `MarkingType::PageBreak`
381 // candidate at every form-feed and at every `\n\n\n+` run; on each
382 // such candidate we drop the accumulator and start a fresh page.
383 let mut page_context = PageContext::new();
384 // Cache the current Arc<PageContext> so that consecutive banner/CAB
385 // candidates on the same page share a single allocation. The cache is
386 // invalidated (set to None) whenever a new portion is accumulated or
387 // a page break resets the context.
388 let mut page_context_arc: Option<Arc<PageContext>> = None;
389
390 // FR-011: per-page strict classification floor. Tracks the
391 // highest classification rank produced by the strict path on
392 // the current page (`marque_ism::Classification as u8`,
393 // Unclassified=0 … TopSecret=4). Threaded into
394 // `ParseContext::classification_floor` so the decoder rejects
395 // any candidate at a strictly-lower level on the same page.
396 // Reset on `MarkingType::PageBreak` per Constitution VI's
397 // "PageContext resets at scanner-emitted page-break candidates"
398 // invariant. Updated *only* by classifications drawn from
399 // strict-path recognitions — decoder-recovered markings do not
400 // raise the floor for themselves (otherwise a misrecognition
401 // would self-justify by raising the floor it then clears).
402 let mut classification_floor: Option<u8> = None;
403
404 for candidate in &candidates {
405 // T008: per-candidate deadline check. Checking at the top
406 // of the loop (before any per-candidate work — including
407 // a page-break reset) guarantees the abort happens
408 // between candidates, never partway through the rule
409 // pipeline. On expiry we return immediately so the
410 // post-loop corrections-map AhoCorasick pass — which is
411 // O(source bytes) — does NOT overrun the deadline.
412 // Returning here also gives the spec-correct
413 // `truncated/processed/total` triple to the caller
414 // without falling through the rest of the function.
415 if deadline_expired(opts.deadline) {
416 return LintResult {
417 diagnostics,
418 truncated: true,
419 candidates_processed,
420 candidates_total,
421 ..Default::default()
422 };
423 }
424
425 // T009: count every candidate the engine started
426 // processing past the deadline boundary. The increment
427 // sits ABOVE the early-`continue` paths below
428 // (page-break reset, empty span, ambiguous recognition)
429 // so a complete pass always reports
430 // `candidates_processed == candidates_total` — the
431 // documented contract for a non-truncated `LintResult`.
432 // A pass that aborts mid-loop reports `processed <
433 // total` with the count of candidates we got past the
434 // per-candidate check.
435 candidates_processed += 1;
436
437 // Page-break candidates are scanner-emitted boundaries with no
438 // parsable content. Reset the context BEFORE attempting to parse
439 // — otherwise the parser's MalformedMarking error would skip the
440 // continue and leave us accumulating across pages.
441 if candidate.kind == MarkingType::PageBreak {
442 page_context = PageContext::new();
443 page_context_arc = None;
444 classification_floor = None;
445 continue;
446 }
447
448 // Parse context built per-candidate so the floor accumulated
449 // earlier on the page reaches the recognizer. `strict_evidence
450 // = false` permits the dispatcher
451 // (`StrictOrDecoderRecognizer`, the default) to fall back to
452 // the decoder on strict-parse zero-candidate. The
453 // `StrictRecognizer` ignores this flag entirely; consumers
454 // that pin strict-only behavior install it via
455 // [`Engine::with_recognizer`].
456 //
457 // `preceded_by_whitespace` is computed against the source
458 // buffer here — the decoder receives only the candidate
459 // slice and cannot recover the surrounding context on its
460 // own. Used downstream to suppress prose-glue false
461 // positives like `letter(s)` / `loss(s)` /
462 // `function(c)`. Start-of-buffer counts as whitespace by
463 // the `ParseContext` convention.
464 let preceded_by_whitespace = match candidate.span.start.checked_sub(1) {
465 None => true,
466 Some(prev_idx) => source
467 .get(prev_idx)
468 .map(|b| b.is_ascii_whitespace())
469 .unwrap_or(true),
470 };
471 let parse_cx = ParseContext {
472 strict_evidence: false,
473 zone: None,
474 position: None,
475 classification_floor,
476 as_of: None,
477 preceded_by_whitespace,
478 };
479
480 // Route each candidate's bytes through the recognizer. Zero-
481 // candidate `Ambiguous` means "no plausible interpretation" —
482 // skip, same as a strict-path parser error would in the old
483 // flow (foundational-plan line 609-612). `Unambiguous` returns
484 // a `CapcoMarking` whose `token_spans` are zero-origin relative
485 // to the candidate bytes; shift them back to source-relative
486 // offsets before rules see them.
487 let start = candidate.span.start.min(source.len());
488 let end = candidate.span.end.min(source.len());
489 if start >= end {
490 continue;
491 }
492 let bytes = &source[start..end];
493 let Parsed::Unambiguous(mut marking) = self.recognizer.recognize(bytes, &parse_cx)
494 else {
495 continue;
496 };
497 shift_token_spans(&mut marking.0, start);
498 // Capture the decoder-provenance side channel before
499 // collapsing the marking onto its `IsmAttributes` payload.
500 // Strict-path recognizers leave this `None`; the decoder
501 // populates it with the canonical bytes / posterior /
502 // features the engine needs to mint a
503 // `FixSource::DecoderPosterior` diagnostic below.
504 let provenance = marking.1.take();
505 let attrs = marking.0;
506
507 // FR-011 strict-floor accumulator: only strict-path
508 // recognitions raise the floor. A decoder-path
509 // recognition (provenance.is_some()) does not — we cannot
510 // let a probabilistic recovery self-justify by raising
511 // the threshold it then clears.
512 if provenance.is_none() {
513 if let Some(level) = attrs
514 .classification
515 .as_ref()
516 .map(|c| c.effective_level() as u8)
517 {
518 classification_floor = Some(match classification_floor {
519 Some(prev) => prev.max(level),
520 None => level,
521 });
522 }
523 }
524
525 // Decoder-path emission (T068): when the recognizer carries
526 // provenance, the recognition went through the decoder
527 // fallback. Synthesize an R001 `decoder-recognition`
528 // diagnostic whose fix rewrites the original mangled bytes
529 // to the decoder's canonical form, with `FixSource::DecoderPosterior`
530 // and a populated `Confidence` (`recognition < 1.0`,
531 // `runner_up_ratio = Some(r)`, non-empty `features`). The
532 // fix participates in the regular confidence-threshold
533 // gate inside `Engine::fix_inner`.
534 if let Some(prov) = provenance {
535 let span = Span::new(start, end);
536 if let Some(diagnostic) = build_decoder_diagnostic(
537 span,
538 bytes,
539 &prov,
540 candidate.kind,
541 self.corpus_override_active(),
542 ) {
543 diagnostics.push(diagnostic);
544 }
545 }
546
547 // Accumulate portions before running banner/CAB rules so that
548 // when we reach a banner candidate the context already reflects
549 // all preceding portion data.
550 if candidate.kind == MarkingType::Portion {
551 page_context.add_portion(attrs.clone());
552 // Invalidate the cached Arc so the next banner/CAB gets a
553 // fresh snapshot. We rebuild it lazily below.
554 page_context_arc = None;
555 }
556
557 // Phase 3: zone and position are Option-typed and stay None
558 // until a structural scanner pass can prove them. The previous
559 // hardcoded `Zone::Body`/`DocumentPosition::Body` was a silent
560 // lie to any future rule that read them.
561 let ctx_page = if candidate.kind != MarkingType::Portion && !page_context.is_empty() {
562 // Lazily wrap the accumulated context in an Arc once per
563 // page-context snapshot; subsequent banner/CAB candidates on
564 // the same page clone only the cheap Arc pointer.
565 Some(
566 page_context_arc
567 .get_or_insert_with(|| Arc::new(page_context.clone()))
568 .clone(),
569 )
570 } else {
571 None
572 };
573 let ctx = RuleContext {
574 marking_type: candidate.kind,
575 zone: None,
576 position: None,
577 page_context: ctx_page,
578 corrections: corrections_arc.clone(),
579 };
580 for rule_set in &self.rule_sets {
581 for rule in rule_set.rules() {
582 // Skip rules that are configured as Off.
583 let configured_severity = self
584 .config
585 .rules
586 .overrides
587 .get(rule.id().as_str())
588 .and_then(|s| Severity::parse_config(s))
589 .unwrap_or(rule.default_severity());
590
591 if configured_severity == Severity::Off {
592 continue;
593 }
594
595 // Whitepaper §6.3 / gap register #10: a buggy rule
596 // that constructs an out-of-range `Confidence`
597 // panics inside `FixProposal::new`. Without this
598 // wrapper, that panic propagates out of `lint()`
599 // and aborts the entire document — turning one
600 // rule's defect into a service outage. Catch the
601 // unwind, log a warning naming the rule, and
602 // skip it. Other rules and other candidates keep
603 // running.
604 //
605 // `AssertUnwindSafe` is a deliberate best-effort
606 // containment — `Send + Sync` (which `Rule`
607 // requires) is NOT the same property as
608 // `UnwindSafe`. The justification rests on the
609 // engine's stateless-rule contract
610 // (`crates/rules/src/lib.rs` `Rule` doc comments):
611 // `check()` must not mutate state visible across
612 // invocations. A rule that violates that contract
613 // via interior mutability could in principle
614 // observe a torn invariant after a panic — but the
615 // alternative is to abort the whole `lint()` on
616 // any rule defect, which is the bug this wrapper
617 // exists to fix. Containing the failure to the
618 // offending rule is strictly better than letting
619 // it cascade. Diagnostics we'd otherwise have
620 // appended on success are built fresh inside the
621 // closure, so they don't pollute the outer
622 // accumulator on the panic path.
623 //
624 // Requires `panic = "unwind"` in the release
625 // profile (`Cargo.toml`). With `panic = "abort"`
626 // the panic terminates the process before this
627 // catch can fire.
628 let rule_id = rule.id();
629 let catch_result =
630 std::panic::catch_unwind(AssertUnwindSafe(|| rule.check(&attrs, &ctx)));
631 let mut diags = match catch_result {
632 Ok(d) => d,
633 Err(payload) => {
634 let msg = panic_payload_to_string(&payload);
635 tracing::warn!(
636 target: "marque_engine::rule_panic",
637 rule = rule_id.as_str(),
638 error = %msg,
639 "rule check panicked; skipping this rule for the current candidate"
640 );
641 Vec::new()
642 }
643 };
644 // Apply configured severity override.
645 for d in &mut diags {
646 d.severity = configured_severity;
647 }
648 diagnostics.extend(diags);
649 }
650 }
651 }
652
653 // Pre-scanner text corrections: scan the raw source for
654 // corrections-map keys that the scanner missed (e.g., "SERCET" is
655 // not a known classification prefix, so the scanner never detects
656 // "SERCET//NF" as a candidate, and C001 never sees the token).
657 //
658 // This pass emits C001 diagnostics for raw-text matches that don't
659 // overlap with any C001 diagnostic already produced by the rule
660 // pipeline above. Spans reference the original source buffer.
661 if let Some(cached) = &self.corrections_ac {
662 let c001_severity = self
663 .config
664 .rules
665 .overrides
666 .get("C001")
667 .and_then(|s| Severity::parse_config(s))
668 .unwrap_or(Severity::Fix);
669
670 if c001_severity != Severity::Off {
671 // Collect spans already covered by rule-pipeline C001.
672 let existing_c001_spans: std::collections::HashSet<Span> = diagnostics
673 .iter()
674 .filter(|d| d.rule.as_str() == "C001")
675 .map(|d| d.span)
676 .collect();
677
678 // Use the pre-built AhoCorasick automaton to scan the full
679 // source in a single O(n + m) pass. The automaton and its
680 // active pairs were built once at Engine construction time.
681 for mat in cached.ac.find_iter(source) {
682 let span = Span::new(mat.start(), mat.end());
683 let (ref key, ref value) = cached.active[mat.pattern().as_usize()];
684
685 // Skip if the rule pipeline already produced a C001
686 // diagnostic for this exact span.
687 if !existing_c001_spans.contains(&span) {
688 let proposal = FixProposal::new(
689 RuleId::new("C001"),
690 FixSource::CorrectionsMap,
691 span,
692 key.as_ref(),
693 value.as_ref(),
694 marque_rules::Confidence::strict(1.0),
695 None,
696 );
697 diagnostics.push(Diagnostic::new(
698 RuleId::new("C001"),
699 c001_severity,
700 span,
701 format!("corrections map: {key:?} → {value:?}"),
702 CORRECTIONS_MAP_CITATION,
703 Some(proposal),
704 ));
705 }
706 }
707 }
708 }
709
710 // Suggest-don't-fix channel post-pass (issue #235 / #186 PR-3).
711 //
712 // Only `Severity::Fix` diagnostics are rewritten — those are
713 // the ones whose authoring rule expects auto-application. A
714 // sub-threshold `FixProposal` attached to a `Fix`-severity
715 // diagnostic stays observable in lint output by being
716 // demoted to `Severity::Suggest` instead of being silently
717 // dropped at the fix-collection threshold gate.
718 //
719 // Error/Warn/Info rules with sub-threshold fixes keep their
720 // severity (the violation IS what the rule says it is; only
721 // the suggested replacement is uncertain) and the fix is
722 // silently dropped at the apply gate as before. Suggest-channel
723 // reuse for Error/Warn fixes is out of scope for PR-C — making
724 // a normative ordering rule like E003 CI-silent because its
725 // fix confidence sits below threshold would be a behavioral
726 // regression.
727 //
728 // This unifies two emission paths into a single visible
729 // channel for `Fix`-severity rules:
730 //
731 // - Rules that explicitly emit at `Severity::Suggest`
732 // (e.g., `S004 rel-to-trigraph-suggest`).
733 // - `Fix`-severity rules whose proposal confidence falls
734 // below the configured threshold (decoder-sourced fixes
735 // that didn't quite clear the bar are the canonical case).
736 //
737 // The fix stays attached because the renderer surfaces the
738 // candidate replacement; only the severity is changed. The
739 // constitutional V audit-content-ignorance invariant is
740 // preserved — no fields are modified except `severity`,
741 // which is metadata not document content.
742 //
743 // `Engine::fix_inner` re-applies the threshold gate on its own
744 // (and now also filters by `severity != Suggest`), so a
745 // diagnostic rewritten here will not be promoted to an
746 // `AppliedFix` even if a later threshold-override raises the
747 // floor.
748 let threshold = self.config.confidence_threshold();
749 for d in &mut diagnostics {
750 if d.severity != Severity::Fix {
751 continue;
752 }
753 let Some(fix) = d.fix.as_ref() else { continue };
754 if fix.confidence.combined() < threshold {
755 d.severity = Severity::Suggest;
756 }
757 }
758
759 LintResult {
760 diagnostics,
761 truncated: false,
762 candidates_processed,
763 candidates_total,
764 ..Default::default()
765 }
766 }
767
768 /// Lint and apply fixes. Returns fixed source and audit log.
769 ///
770 /// Fix application order follows FR-016: `(span.end DESC, span.start DESC,
771 /// rule_id ASC, replacement ASC)` so reverse-byte application preserves
772 /// earlier-span offsets and equal-span ties break deterministically.
773 ///
774 /// Uses the confidence threshold configured in the engine's `Config`.
775 /// To supply a per-call override (e.g., from a `--confidence` CLI flag
776 /// or an HTTP request field), use [`Engine::fix_with_threshold`] or
777 /// [`Engine::fix_with_options`].
778 ///
779 /// Back-compat shim over [`Engine::fix_with_options`] — `fix(src, mode)`
780 /// is equivalent to `fix_with_options(src, mode, &FixOptions::default())`
781 /// (no deadline, no threshold override). Both invariants make the
782 /// `expect` here unreachable: the default options carry no deadline so
783 /// `EngineError::DeadlineExceeded` cannot fire, and the config
784 /// threshold is pre-validated at load time so
785 /// `EngineError::InvalidThreshold` cannot fire.
786 pub fn fix(&self, source: &[u8], mode: FixMode) -> FixResult {
787 self.fix_with_options(source, mode, &FixOptions::default())
788 .expect(
789 "fix() default options cannot fail: no deadline + pre-validated config threshold",
790 )
791 }
792
793 /// Lint and apply fixes using an optional per-call confidence threshold.
794 ///
795 /// When `threshold_override` is `Some`, it replaces the config-level
796 /// threshold for this call only and is validated against `[0.0, 1.0]`.
797 /// When `None`, the engine falls back to `Config::confidence_threshold`.
798 ///
799 /// This signature is preserved for back-compat. New callers should
800 /// prefer [`Engine::fix_with_options`], which carries the deadline
801 /// surface alongside the threshold override.
802 pub fn fix_with_threshold(
803 &self,
804 source: &[u8],
805 mode: FixMode,
806 threshold_override: Option<f32>,
807 ) -> Result<FixResult, InvalidThreshold> {
808 let opts = FixOptions {
809 threshold_override,
810 ..Default::default()
811 };
812 match self.fix_with_options(source, mode, &opts) {
813 Ok(result) => Ok(result),
814 Err(EngineError::InvalidThreshold(it)) => Err(it),
815 // No caller can reach this arm: `fix_with_threshold`'s
816 // public signature does not accept a deadline, so the
817 // `FixOptions` we built above has `deadline: None`. A
818 // future signature change that introduces one would have
819 // to remove this `unreachable!` deliberately.
820 Err(EngineError::DeadlineExceeded { .. }) => {
821 unreachable!("fix_with_threshold cannot set a deadline through its signature")
822 }
823 }
824 }
825
826 /// Lint and apply fixes with per-call options (spec 005 §R2).
827 ///
828 /// Phase 2 honors `opts.deadline` via cooperative cancellation
829 /// (spec §R3). Asymmetric response per §R4 / Constitution V
830 /// Principle V (audit-record integrity): a deadline expiring at
831 /// any point during the fix path returns
832 /// `Err(EngineError::DeadlineExceeded { partial_lint })` rather
833 /// than a partial `FixResult`. The `partial_lint` carries
834 /// whatever the lint phase had produced before the deadline
835 /// fired (or a fully-truncated lint when the deadline was
836 /// already expired on entry); no half-applied fix is ever
837 /// emitted into the audit stream.
838 ///
839 /// `opts.threshold_override` is honored from Phase 1 onward; an
840 /// out-of-range / NaN value is rejected as
841 /// `EngineError::InvalidThreshold` before any work runs.
842 pub fn fix_with_options(
843 &self,
844 source: &[u8],
845 mode: FixMode,
846 opts: &FixOptions,
847 ) -> Result<FixResult, EngineError> {
848 let threshold = match opts.threshold_override {
849 Some(value) => {
850 if !(0.0..=1.0).contains(&value) || value.is_nan() {
851 return Err(EngineError::InvalidThreshold(InvalidThreshold(value)));
852 }
853 value
854 }
855 None => self.config.confidence_threshold(),
856 };
857
858 self.fix_inner(source, mode, threshold, opts.deadline)
859 }
860
861 fn fix_inner(
862 &self,
863 source: &[u8],
864 mode: FixMode,
865 threshold: f32,
866 deadline: Option<Instant>,
867 ) -> Result<FixResult, EngineError> {
868 use std::collections::HashSet;
869
870 // Two-pass fix strategy for pre-scanner text corrections.
871 //
872 // Pass 1: lint the original source. The pre-scanner text scan may
873 // produce C001 diagnostics for corrections-map matches the scanner
874 // missed (e.g., "SERCET" is not a known classification prefix).
875 // Apply those C001 fixes to produce an intermediate source.
876 //
877 // Pass 2: re-lint the intermediate source. The scanner now detects
878 // the corrected marking (e.g., "SECRET//NF") and additional rules
879 // fire (e.g., E001 on NF→NOFORN). Apply those fixes on top.
880 //
881 // Without this, the spec scenario "SERCET//NF → SECRET//NOFORN"
882 // would stop at "SECRET//NF".
883 //
884 // T010: deadline propagates to every internal lint pass. An
885 // expired deadline at lint time produces a truncated lint, and
886 // the post-lint check below converts that into the asymmetric
887 // `Err(DeadlineExceeded { partial_lint })` shape per spec §R4
888 // (Constitution V Principle V — no partial `FixResult` leaks
889 // into the audit stream).
890 let lint_opts = LintOptions {
891 deadline,
892 ..Default::default()
893 };
894 let lint1 = self.lint_with_options(source, &lint_opts);
895 if deadline_expired(deadline) {
896 return Err(EngineError::DeadlineExceeded {
897 partial_lint: lint1,
898 });
899 }
900 let (effective_source, pass1_applied) =
901 self.apply_text_corrections(source, &lint1, threshold, mode);
902
903 let lint = if !pass1_applied.is_empty() {
904 // Re-lint the corrected source so the scanner picks up newly-valid markings.
905 self.lint_with_options(&effective_source, &lint_opts)
906 } else {
907 lint1
908 };
909
910 // Post-lint deadline check: if the deadline expired during
911 // either pass-1 or pass-2 lint (or during text-correction
912 // application between them), bail out before building any
913 // fix entries. `partial_lint` carries whatever the lint phase
914 // produced — including `truncated: true` when applicable.
915 if deadline_expired(deadline) {
916 return Err(EngineError::DeadlineExceeded { partial_lint: lint });
917 }
918
919 // Suggest-don't-fix channel: `Severity::Suggest` is a hard
920 // exclusion from auto-apply by construction. The lint
921 // post-pass already rewrites below-threshold proposals to
922 // `Suggest`, but explicit `Suggest` rules (e.g., S004) can
923 // also emit fixes that clear the threshold yet must NOT be
924 // applied. This filter handles both cases uniformly.
925 let mut fixes: Vec<_> = lint
926 .diagnostics
927 .iter()
928 .filter(|d| d.severity != Severity::Suggest)
929 .filter_map(|d| d.fix.as_ref())
930 .filter(|f| f.confidence.combined() >= threshold)
931 .filter(|f| !f.span.is_empty())
932 .collect();
933
934 // FR-016: deterministic total-order fix application.
935 // Sort by (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
936 fixes.sort_by(|a, b| {
937 b.span
938 .end
939 .cmp(&a.span.end)
940 .then(b.span.start.cmp(&a.span.start))
941 .then(a.rule.cmp(&b.rule))
942 .then(a.replacement.cmp(&b.replacement))
943 });
944
945 // C-1: overlap guard. After the FR-016 sort, two fixes can still
946 // touch the same byte range if multiple rules emit a fix for the
947 // same span (or overlapping spans). Applying both via `splice`
948 // would silently corrupt the byte stream. We keep the first fix
949 // per span (which under FR-016 ordering is deterministic) and
950 // surface the dropped fixes through `remaining_diagnostics`.
951 //
952 // The walk is over fixes in reverse-end order, so a fix is kept
953 // only if its `span.end` is at or below the previous kept fix's
954 // `span.start` — i.e., strictly to the left, no overlap.
955 // Clone the kept fixes into owned `FixProposal`s so the
956 // borrow on `lint.diagnostics` ends with `fixes`. That
957 // matters for T011: the per-fix deadline-bail path needs to
958 // move `lint` into `EngineError::DeadlineExceeded`, which is
959 // only legal once nothing inside the body still references
960 // it. The clone count is bounded by the number of kept
961 // fixes (after the C-1 dedup), which is small in practice.
962 let mut kept_fixes: Vec<FixProposal> = Vec::with_capacity(fixes.len());
963 let mut next_window_end: Option<usize> = None;
964 for fix in &fixes {
965 let fits = match next_window_end {
966 Some(boundary) => fix.span.end <= boundary,
967 None => true,
968 };
969 if fits {
970 next_window_end = Some(fix.span.start);
971 kept_fixes.push((*fix).clone());
972 }
973 }
974 drop(fixes); // release the iter borrow on `lint.diagnostics`
975
976 // M-4: hold the classifier id in an `Arc<str>` so cloning into each
977 // applied-fix audit record is an O(1) refcount bump rather than a
978 // full string copy per fix.
979 let classifier_id: Option<std::sync::Arc<str>> = self
980 .config
981 .user
982 .classifier_id
983 .as_deref()
984 .map(std::sync::Arc::from);
985 let dry_run = mode == FixMode::DryRun;
986 let now = self.clock.now();
987
988 // H-7: applied-fix lookup is keyed by (RuleId, Span). Use a HashSet
989 // so the per-diagnostic filter at the bottom of this function is
990 // O(1) per query instead of O(n) over a Vec.
991 let mut applied_keys: HashSet<(RuleId, Span)> = HashSet::with_capacity(kept_fixes.len());
992 let mut applied: Vec<AppliedFix> = Vec::with_capacity(kept_fixes.len());
993
994 // T011: per-fix-application deadline check. The check sits
995 // at the top of each iteration so the abort happens between
996 // fixes — the audit-record integrity invariant
997 // (Constitution V Principle V) is preserved because we
998 // never construct a half-applied `FixResult`. If a fix has
999 // already been applied to `buf` and `applied`, we drop both
1000 // and surface the asymmetric `Err(DeadlineExceeded)` shape;
1001 // the partial buffer is intentionally discarded so no
1002 // partially-fixed bytes can leak to a caller.
1003 //
1004 // Pre-apply check: catch a deadline that expired during
1005 // fix collection / sort / dedup BEFORE we clone
1006 // `effective_source` into `buf` (which is O(source bytes)
1007 // and pointless if we're about to drop the buffer on the
1008 // floor). On large inputs the clone alone can be the
1009 // dominant cost; the post-lint check above doesn't cover
1010 // it because the sort + dedup phase between the two adds
1011 // its own latency on documents with many fixes.
1012 if deadline_expired(deadline) {
1013 return Err(EngineError::DeadlineExceeded { partial_lint: lint });
1014 }
1015
1016 // Only allocate the output buffer when we actually need to
1017 // mutate it. Dry-run returns the original source verbatim.
1018 let mut deadline_aborted = false;
1019 let output = match mode {
1020 FixMode::Apply => {
1021 // Forward-pass buffer construction: O(source_len + Σ replacement_lens).
1022 //
1023 // `kept_fixes` is in (span.end DESC, span.start DESC) order
1024 // from the FR-016 sort (line ~936) and C-1 dedup walk.
1025 // Iterating in reverse gives ascending span.end / span.start
1026 // order so we can copy each gap and replacement in a single
1027 // left-to-right pass over `effective_source`.
1028 //
1029 // This replaces the previous `Vec::splice`-per-fix approach
1030 // that was O(N × M): each splice shifted every byte after the
1031 // splice point, so N evenly-spaced fixes on an M-byte buffer
1032 // cost O(N × M / 2) total — quadratic when fix density scales
1033 // with document size.
1034 //
1035 // After C-1 has guaranteed `kept_fixes` is non-overlapping in
1036 // reverse-end order, ascending order is also non-overlapping
1037 // (the property does not depend on traversal direction), so the
1038 // forward walk is safe.
1039 let extra: usize = kept_fixes
1040 .iter()
1041 .map(|f| {
1042 // `saturating_sub` gives the per-fix growth contribution
1043 // (0 when the replacement is shorter than the span).
1044 // The result is an upper-bound preallocation: fixes that
1045 // shrink the buffer contribute 0 here, so the true net
1046 // change may be smaller. This is intentional — it avoids
1047 // the sign-handling complexity of a true net delta while
1048 // still preventing the O(log N) reallocation cascade that
1049 // would occur for repeated grow-by-one insertions.
1050 f.replacement
1051 .len()
1052 .saturating_sub(f.span.end - f.span.start)
1053 })
1054 .sum();
1055 let mut buf = Vec::with_capacity(effective_source.len() + extra);
1056 let mut last_end = 0usize;
1057 for fix in kept_fixes.iter().rev() {
1058 if deadline_expired(deadline) {
1059 deadline_aborted = true;
1060 break;
1061 }
1062 buf.extend_from_slice(&effective_source[last_end..fix.span.start]);
1063 buf.extend_from_slice(fix.replacement.as_bytes());
1064 last_end = fix.span.end;
1065 }
1066 if !deadline_aborted {
1067 // Append the tail after the last fix (or the full source if
1068 // there were no fixes).
1069 buf.extend_from_slice(&effective_source[last_end..]);
1070 }
1071 // Audit records: original descending order, matching DryRun so
1072 // the two modes produce identical `applied` orderings.
1073 if !deadline_aborted {
1074 for fix in kept_fixes {
1075 if deadline_expired(deadline) {
1076 deadline_aborted = true;
1077 break;
1078 }
1079 applied_keys.insert((fix.rule.clone(), fix.span));
1080 applied.push(AppliedFix::__engine_promote(
1081 fix,
1082 now,
1083 classifier_id.clone(),
1084 dry_run,
1085 None, // input identifier set by CLI at the boundary
1086 engine_promotion_token(),
1087 ));
1088 }
1089 }
1090 buf
1091 }
1092 FixMode::DryRun => {
1093 for fix in kept_fixes {
1094 if deadline_expired(deadline) {
1095 deadline_aborted = true;
1096 break;
1097 }
1098 applied_keys.insert((fix.rule.clone(), fix.span));
1099 applied.push(AppliedFix::__engine_promote(
1100 fix,
1101 now,
1102 classifier_id.clone(),
1103 dry_run,
1104 None,
1105 engine_promotion_token(),
1106 ));
1107 }
1108 source.to_vec()
1109 }
1110 };
1111
1112 if deadline_aborted {
1113 // `partial_lint` carries the full diagnostics produced by
1114 // the lint phase that completed before the apply loop ran.
1115 // The apply loop ran partially; per Constitution V
1116 // Principle V, that partial state is dropped on the floor
1117 // and the caller sees only the lint result. Pass-1 text
1118 // corrections that were applied are also discarded — the
1119 // audit stream gets nothing from this call.
1120 return Err(EngineError::DeadlineExceeded { partial_lint: lint });
1121 }
1122
1123 // Prepend pass-1 text corrections to the applied list so they
1124 // appear in the audit trail.
1125 let mut all_applied = pass1_applied;
1126 all_applied.extend(applied);
1127
1128 // Remaining diagnostics: those whose fix was not applied.
1129 // Filter by (rule_id, span) pair — not just rule ID — so that if
1130 // rule E001 fires on three spans and only one is fixed, the other
1131 // two remain.
1132 let remaining_diagnostics = lint
1133 .diagnostics
1134 .into_iter()
1135 .filter(|d| {
1136 !d.fix
1137 .as_ref()
1138 .is_some_and(|f| applied_keys.contains(&(f.rule.clone(), f.span)))
1139 })
1140 .collect();
1141
1142 Ok(FixResult {
1143 source: output,
1144 applied: all_applied,
1145 remaining_diagnostics,
1146 })
1147 }
1148
1149 /// Apply pre-scanner text corrections (C001) from lint diagnostics and
1150 /// return the corrected source + applied fixes. Used by `fix_inner` to
1151 /// produce an intermediate source that the scanner can detect.
1152 fn apply_text_corrections(
1153 &self,
1154 source: &[u8],
1155 lint: &LintResult,
1156 threshold: f32,
1157 mode: FixMode,
1158 ) -> (Vec<u8>, Vec<AppliedFix>) {
1159 // Mirror `fix_inner`'s suggest-channel exclusion: a C001
1160 // diagnostic that the lint post-pass rewrote to
1161 // `Severity::Suggest` (because its confidence fell below
1162 // threshold) must not be auto-applied here either.
1163 let mut text_fixes: Vec<&FixProposal> = lint
1164 .diagnostics
1165 .iter()
1166 .filter(|d| d.rule.as_str() == "C001")
1167 .filter(|d| d.severity != Severity::Suggest)
1168 .filter_map(|d| d.fix.as_ref())
1169 .filter(|f| f.source == FixSource::CorrectionsMap)
1170 .filter(|f| f.confidence.combined() >= threshold)
1171 .filter(|f| !f.span.is_empty())
1172 .collect();
1173
1174 if text_fixes.is_empty() {
1175 return (source.to_vec(), Vec::new());
1176 }
1177
1178 // Sort and deduplicate using FR-016 order + C-1 overlap guard.
1179 text_fixes.sort_by(|a, b| {
1180 b.span
1181 .end
1182 .cmp(&a.span.end)
1183 .then(b.span.start.cmp(&a.span.start))
1184 .then(a.rule.cmp(&b.rule))
1185 .then(a.replacement.cmp(&b.replacement))
1186 });
1187 let mut kept: Vec<&FixProposal> = Vec::new();
1188 let mut next_end: Option<usize> = None;
1189 for fix in &text_fixes {
1190 let fits = next_end.is_none_or(|b| fix.span.end <= b);
1191 if fits {
1192 next_end = Some(fix.span.start);
1193 kept.push(*fix);
1194 }
1195 }
1196
1197 let classifier_id: Option<Arc<str>> =
1198 self.config.user.classifier_id.as_deref().map(Arc::from);
1199 let dry_run = mode == FixMode::DryRun;
1200 let now = self.clock.now();
1201
1202 // Always apply text corrections to the intermediate buffer, even in
1203 // DryRun mode. This buffer is internal — pass 2 needs it to re-lint
1204 // corrected text so downstream rules fire (e.g., E001 on NF after
1205 // SERCET→SECRET). The final output for DryRun returns the original
1206 // source in fix_inner, not this intermediate buffer.
1207 let mut buf = source.to_vec();
1208 let mut applied = Vec::with_capacity(kept.len());
1209 for fix in &kept {
1210 buf.splice(fix.span.start..fix.span.end, fix.replacement.bytes());
1211 applied.push(AppliedFix::__engine_promote(
1212 (*fix).clone(),
1213 now,
1214 classifier_id.clone(),
1215 dry_run,
1216 None,
1217 engine_promotion_token(),
1218 ));
1219 }
1220
1221 (buf, applied)
1222 }
1223}
1224
1225// ---------------------------------------------------------------------------
1226// Engine-only AppliedFix promotion gate (Constitution V Principle V)
1227// ---------------------------------------------------------------------------
1228
/// Mint the [`EnginePromotionToken`] that [`AppliedFix::__engine_promote`]
/// requires.
///
/// This is the **single** place inside `marque-engine` where the engine
/// grants itself the privilege to promote a `FixProposal` to an
/// `AppliedFix`. Constitution V Principle V scopes audit-record promotion
/// to `Engine::fix_inner` and `Engine::apply_text_corrections` (the three
/// production call sites in this file). Routing every one of them through
/// this function makes "where does the engine decide to promote?" a
/// one-grep question, and means a future refactor that adds a fourth
/// promotion site has to thread through here — a deliberate decision, not
/// an accident.
///
/// `EnginePromotionToken`'s sole field is private to `marque-rules`, so
/// external crates cannot brace-construct one. The `__engine_construct`
/// constructor on the token is `#[doc(hidden)]` and named to make its
/// intent unmistakable to anyone reading a call site outside the engine.
///
/// Returns a freshly constructed token on every call.
#[inline]
fn engine_promotion_token() -> EnginePromotionToken {
    EnginePromotionToken::__engine_construct()
}
1250
1251// ---------------------------------------------------------------------------
1252// Decoder-path diagnostic synthesis (Phase 4 PR-4b — T068)
1253// ---------------------------------------------------------------------------
1254
1255/// Build the synthetic `R001 decoder-recognition` diagnostic the engine
1256/// emits when a recognizer returned a marking carrying
1257/// [`DecoderProvenance`]. Returns `None` when the original or canonical
1258/// bytes are not valid UTF-8 — `FixProposal` carries `Box<str>` for both
1259/// `original` and `replacement`, so we cannot construct the proposal
1260/// without UTF-8 validity. CAPCO markings are ASCII by spec (CAPCO-2016
1261/// §A.6); a non-UTF-8 result here would mean the canonicalization pass
1262/// produced something the strict parser shouldn't have accepted, which
1263/// is a separate bug to surface — silently dropping the synthetic
1264/// diagnostic is the conservative move.
1265///
1266/// # Audit-shape contract (Constitution V Principle V / G13)
1267///
1268/// The diagnostic's `message` and the synthesized `FixProposal.original`
1269/// MUST NOT carry verbatim input bytes — only token canonicals, span
1270/// offsets, and digests/posterior scalars are permitted in audit
1271/// output. The "before" form is therefore omitted from the message
1272/// and `proposal.original` is set to the empty string for
1273/// decoder-path R001 records: span tells the audit consumer *where*
1274/// the fix landed, `proposal.replacement` carries *what* it became.
1275/// The original bytes already exist in the source document; the audit
1276/// record is not the right channel for them.
1277///
1278/// Note: this contract addresses the audit-record *shape*. A separate
1279/// upstream concern is whether `proposal.replacement` itself is a
1280/// well-formed canonical (Constitution V permits "token canonicals"
1281/// in audit output). When the decoder accepts unrecognized bytes as a
1282/// compartment-shaped token and uppercases them, the resulting
1283/// "canonical" carries those bytes through `replacement` — that's a
1284/// decoder-correctness issue to address separately.
1285///
1286/// The fix's `Confidence` is populated entirely from the decoder's
1287/// provenance trace:
1288///
1289/// - `recognition` derives from `runner_up_ratio` via softmax (see
1290/// [`DecoderProvenance::recognition_score`]); strictly less than
1291/// `1.0` so audit consumers can distinguish strict from decoder
1292/// provenance via a single field comparison.
1293/// - `rule` is `1.0` — once the decoder has decided unambiguously the
1294/// recognition-layer rewrite is itself unambiguous (rewrite the
1295/// observed bytes to canonical bytes), so the rule axis carries no
1296/// additional uncertainty. The decoder's recognition uncertainty is
1297/// already captured in `recognition`.
1298/// - `runner_up_ratio` and `features` thread through verbatim from the
1299/// provenance.
1300/// - When `corpus_override_active` is `true`, an extra
1301/// [`FeatureId::CorpusOverrideInEffect`] contribution with
1302/// `delta = 0.0` is appended to `features`. The zero delta is
1303/// load-bearing: PR-5 minimal scope wires the surface end-to-end
1304/// without yet substituting override priors into decoder scoring,
1305/// so the contribution is purely an audit-trail marker
1306/// ("this fix was produced under organizational overrides")
1307/// rather than an actual posterior shift. A future PR that wires
1308/// override-prior substitution will replace `0.0` with the real
1309/// delta and re-version the audit schema.
1310fn build_decoder_diagnostic(
1311 span: Span,
1312 original_bytes: &[u8],
1313 provenance: &DecoderProvenance,
1314 _kind: marque_ism::MarkingType,
1315 corpus_override_active: bool,
1316) -> Option<Diagnostic> {
1317 use marque_rules::confidence::{FeatureContribution, FeatureId};
1318
1319 let original = std::str::from_utf8(original_bytes).ok()?;
1320 let replacement = std::str::from_utf8(&provenance.canonical_bytes).ok()?;
1321
1322 // No-op rewrite (canonicalization preserved bytes byte-for-byte) is
1323 // not informative and would produce a degenerate audit record; skip.
1324 if original == replacement {
1325 return None;
1326 }
1327
1328 let mut features: Vec<FeatureContribution> = provenance.features.to_vec();
1329 if corpus_override_active {
1330 features.push(FeatureContribution {
1331 id: FeatureId::CorpusOverrideInEffect,
1332 delta: 0.0,
1333 });
1334 }
1335
1336 // Dispatch on the decoder's `fix_source`. Standard vocab-based
1337 // recognition emits at `Severity::Fix` with `rule = 1.0` (engine
1338 // applies whenever `recognition >= confidence_threshold`). The
1339 // position-aware classification heuristic (issue #133 PR 2) emits
1340 // at `Severity::Warn` (always-visible in `--check`, non-zero exit
1341 // code) with `rule = HEURISTIC_RULE_AXIS_CAP = 0.95` matching the
1342 // default `confidence_threshold`. PR 4's empirical corpus
1343 // measurement justifies the `0.95` value — see the cap's doc
1344 // comment for the analysis script and measured numbers.
1345 let (severity, rule_axis, fix_source) = match provenance.fix_source {
1346 FixSource::DecoderClassificationHeuristic => (
1347 Severity::Warn,
1348 HEURISTIC_RULE_AXIS_CAP,
1349 FixSource::DecoderClassificationHeuristic,
1350 ),
1351 // All non-heuristic decoder paths use the existing posterior
1352 // shape. Strict-source variants (BuiltinRule, CorrectionsMap,
1353 // MigrationTable) do not flow through this builder — they
1354 // come from rule-pipeline emissions, not the decoder — so
1355 // routing them to `DecoderPosterior` here is a defensive
1356 // default that preserves the existing strict-decoder shape
1357 // for any future fix-source variant.
1358 _ => (Severity::Fix, 1.0, FixSource::DecoderPosterior),
1359 };
1360
1361 let confidence = Confidence {
1362 recognition: provenance.recognition_score(),
1363 rule: rule_axis,
1364 region: None,
1365 runner_up_ratio: provenance.runner_up_ratio,
1366 features,
1367 };
1368 let rule = RuleId::new(DECODER_RULE_ID);
1369 // Audit-shape contract: `proposal.original` is the empty string for
1370 // decoder-path R001 records (Constitution V Principle V / G13). The
1371 // span identifies *where* the fix landed; the bytes are still in
1372 // the source document. The unused `original` binding documents that
1373 // we held UTF-8 validity for the input but intentionally do not
1374 // route it into the audit record.
1375 let _ = original;
1376 let proposal = FixProposal::new(
1377 rule.clone(),
1378 fix_source,
1379 span,
1380 "",
1381 replacement,
1382 confidence,
1383 None,
1384 );
1385 Some(Diagnostic::new(
1386 rule,
1387 severity,
1388 span,
1389 format!("decoder-recognized canonical form: {replacement:?}"),
1390 DECODER_CITATION,
1391 Some(proposal),
1392 ))
1393}
1394
/// `Confidence::rule` cap for the position-aware classification
/// heuristic (`FixSource::DecoderClassificationHeuristic`). Pinned
/// at `0.95` matching the default `confidence_threshold` — solo-
/// candidate heuristic fixes auto-apply at the default threshold;
/// multi-candidate cases (heuristic plus a competing recovery)
/// drop below `0.95` because `recognition` falls with the runner-
/// up margin and the user retains agency to verify. The diagnostic
/// is always emitted at [`Severity::Warn`](marque_rules::Severity::Warn)
/// regardless of confidence, so `--check` exits non-zero whenever
/// the heuristic fires.
///
/// # Empirical justification (issue #133 PR 4)
///
/// The relevant FP rate isn't "trigger appears in arbitrary prose"
/// but "trigger appears as a standalone token in a context that
/// also contains marking-shape signals (`//` outside URLs, or any
/// CAPCO marking long-form like `NOFORN`/`SECRET`/`REL TO`/etc.)
/// within proximity" — because the decoder heuristic only fires
/// when the strict parse fails on input that's already
/// marking-shaped. PR 2's initial guess of `0.80` was based on the
/// reading "we can't be 97% sure"; PR 4 measured the conditional
/// FP rate against the full Enron corpus and confirmed the
/// in-context heuristic is well-calibrated above `0.95`.
///
/// Headline numbers from the committed evidence file
/// (`tools/corpus-analysis/output/heuristic_frequencies.json`,
/// case-insensitive scan over 510,596 Enron documents — case-
/// insensitive because the decoder uppercases inputs before running
/// the heuristic, so a runtime-faithful measurement must capture
/// lowercase trigger appearances too):
///
/// - **11 of 37 triggers** have zero marking-context hits across
///   the corpus (the case-sensitive prior measurement reported
///   23/37, but those numbers undercounted the runtime distribution).
/// - The worst-case per-occurrence in-context rate is `V` at
///   814/23,331 ≈ 3.49% (`V`→`C` heuristic). Interpreted as "of
///   every 100 standalone `V` tokens in body text, ~3.5 sit
///   within ~30 chars of a marking-shape signal." Corresponds to
///   ~96.5% per-occurrence precision — still above the 0.95 cap,
///   though with thinner headroom than the prior measurement
///   showed.
/// - Most other non-zero triggers stay below ~1.5% per-occurrence
///   (A: 0.15%, E: 0.34%, RE: 0.19%, W: 0.94%, F: 0.50%, etc.).
///
/// **Cap calibration**: the 0.95 cap is justified by the measured
/// per-occurrence in-context rates above. Two prior framings of
/// this paragraph (a "5,000-file sample" with hand-derived numbers
/// and a "Bayesian credible upper bound ≥ 99.94%" calculation) were
/// dropped because (a) the sample numbers were superseded by the
/// full-corpus measurement, and (b) the Bayesian calculation used
/// a different denominator (`marking_context / total_docs`) than
/// the per-occurrence rate (`marking_context / unrestricted`),
/// making them not directly comparable. Use the measured per-
/// occurrence rates directly.
///
/// **Important caveat — loose upper bound**: the per-occurrence rate
/// is an UPPER BOUND on the heuristic's true FP rate, not the rate
/// itself. The metric counts "trigger token appears within ~30 chars
/// of a marking signal," which catches every potential heuristic-
/// fire input but ALSO includes many that the
/// [`try_classification_heuristic_fix`](crate::decoder)
/// guards (lone-input check, leading-position requirement,
/// multi-token-after-leading-position requirement) would filter out
/// before the heuristic ever fires. The true FP rate is likely well
/// below the worst-case 3.49% bound — but if real-world deployment
/// shows V-shaped triggers producing too many false positives, the
/// per-trigger plumbing originally proposed for PR 4 should land
/// (skip-list V, drop its rule confidence, etc.).
///
/// Spot-check the evidence file for per-trigger detail; this doc
/// summarizes qualitatively to avoid drift if the file is
/// regenerated against a different corpus.
///
/// To re-measure (e.g., when a different corpus is added):
///
/// ```text
/// python3 tools/corpus-analysis/analyze.py \
///     --mode heuristic-frequency \
///     --output tools/corpus-analysis/output/heuristic_frequencies.json
/// ```
///
/// If a future measurement shows a trigger's marking-context FP
/// rate above ~1% (e.g., a corpus that contains heavy use of one
/// of these tokens in a marking-adjacent way), this cap should
/// drop or the per-trigger plumbing originally proposed for PR 4
/// should land. Pinned at the engine boundary by
/// `engine::tests::heuristic_rule_axis_cap_matches_default_threshold`.
///
/// NOTE(review): the `V` trigger's measured 3.49% in-context rate quoted
/// above already exceeds this "~1%" tripwire while the cap remains at
/// `0.95`. Confirm whether the tripwire is meant to be ~5% (the
/// complement of the cap) or whether `V` is already a known exception.
const HEURISTIC_RULE_AXIS_CAP: f32 = 0.95;
1483
1484// ---------------------------------------------------------------------------
1485// Rule-override canonicalization (task #49)
1486// ---------------------------------------------------------------------------
1487
1488/// Resolve every key in `config.rules.overrides` against the registered
1489/// rule sets. Both the rule ID (`"E001"`) and the rule name
1490/// (`"portion-mark-in-banner"`) are accepted — after canonicalization
1491/// the override map keys by canonical ID only, and the per-rule lookup
1492/// in `lint()` / `fix_inner()` keeps working unchanged.
1493///
1494/// Fails closed on:
1495/// - **Unknown keys** — `E999 = "warn"` or `not-a-rule = "error"` → the
1496/// user has almost certainly typo'd a rule reference. Silent acceptance
1497/// (the pre-#49 behavior) means the user thought they were configuring
1498/// the rule, but nothing happened at lint time. Emits
1499/// `EngineConstructionError::UnknownRuleOverride` with a best-effort
1500/// `did_you_mean` suggestion (Levenshtein ≤ 3 against the union of
1501/// known IDs and names).
1502/// - **Conflicting duplicate forms** — `E001 = "warn"` AND
1503/// `portion-mark-in-banner = "error"` in the same merged config →
1504/// the two entries resolved to the same rule but with different
1505/// severities. One form would have silently won the HashMap race.
1506/// Emits `EngineConstructionError::ConflictingRuleOverride`.
1507///
1508/// Duplicate forms with the *same* severity are silently accepted —
1509/// a user writing both `E001 = "warn"` and `portion-mark-in-banner =
1510/// "warn"` (intentionally or via copy-paste across config layers) gets
1511/// the expected behavior.
1512fn canonicalize_rule_overrides(
1513 config: &mut Config,
1514 rule_sets: &[Box<dyn RuleSet>],
1515) -> Result<(), EngineConstructionError> {
1516 if config.rules.overrides.is_empty() {
1517 return Ok(());
1518 }
1519
1520 // Build the ID-and-name → canonical-ID lookup. Both sides live in
1521 // `&'static str` (RuleId's inner slice, rule.name()), so the map's
1522 // keys and values are all `'static`.
1523 let mut known: HashMap<&'static str, &'static str> = HashMap::new();
1524 for rule_set in rule_sets {
1525 for rule in rule_set.rules() {
1526 let id_str = rule.id().as_str();
1527 let name = rule.name();
1528 known.insert(id_str, id_str);
1529 known.insert(name, id_str);
1530 }
1531 }
1532
1533 // Walk the raw overrides; resolve each key to its canonical ID, and
1534 // track which source key contributed each canonical entry so we can
1535 // report both sides of a conflict.
1536 let raw = std::mem::take(&mut config.rules.overrides);
1537 let mut by_rule: HashMap<&'static str, (String, String)> = HashMap::new();
1538 for (key, value) in raw {
1539 match known.get(key.as_str()) {
1540 Some(&canonical_id) => {
1541 if let Some((prev_key, prev_sev)) = by_rule.get(canonical_id) {
1542 if prev_sev != &value {
1543 return Err(EngineConstructionError::ConflictingRuleOverride {
1544 rule_id: canonical_id.to_owned(),
1545 keys: Box::new([prev_key.clone(), key]),
1546 severities: Box::new([prev_sev.clone(), value]),
1547 });
1548 }
1549 // Duplicate form, same severity — accept silently.
1550 } else {
1551 by_rule.insert(canonical_id, (key, value));
1552 }
1553 }
1554 None => {
1555 let did_you_mean = suggest_closest(&key, known.keys().copied());
1556 return Err(EngineConstructionError::UnknownRuleOverride { key, did_you_mean });
1557 }
1558 }
1559 }
1560
1561 config.rules.overrides = by_rule
1562 .into_iter()
1563 .map(|(id, (_, sev))| (id.to_owned(), sev))
1564 .collect();
1565 Ok(())
1566}
1567
/// Turn a `catch_unwind` payload into printable text, as far as possible.
///
/// Panic payloads arrive as `Box<dyn Any + Send>`. Two concrete shapes
/// are recognised: a `&'static str` (returned borrowed) and a `String`
/// (returned as an owned copy). Any other payload type yields the fixed
/// placeholder `"<unstringifiable panic payload>"`, so the caller always
/// has some text to report.
fn panic_payload_to_string(
    payload: &Box<dyn std::any::Any + Send + 'static>,
) -> std::borrow::Cow<'static, str> {
    use std::borrow::Cow;

    // Literal-message panics carry a `&'static str`; no allocation needed.
    if let Some(&literal) = payload.downcast_ref::<&'static str>() {
        return Cow::Borrowed(literal);
    }

    // Formatted-message panics carry a `String`; anything else is opaque.
    match payload.downcast_ref::<String>() {
        Some(formatted) => Cow::Owned(formatted.to_owned()),
        None => Cow::Borrowed("<unstringifiable panic payload>"),
    }
}
1588
1589/// Return the closest known rule key (ID or name) to `needle` by
1590/// Levenshtein distance, if the closest candidate is within a small
1591/// edit-distance threshold. Threshold scales with `needle.len()`: short
1592/// strings only match on ≤ 1 edit, longer strings tolerate more.
1593///
1594/// Returns `None` when no candidate is close enough to be useful —
1595/// "did you mean 'REL-TO-noforn-supersession'?" for a user who typed
1596/// "E999" would be worse than no suggestion at all.
1597fn suggest_closest<'a, I>(needle: &str, candidates: I) -> Option<String>
1598where
1599 I: Iterator<Item = &'a str>,
1600{
1601 // Keep the threshold tight so we don't suggest matches that share
1602 // only a couple of characters. The max-distance formula mirrors
1603 // what rustc uses for its "did you mean" hints:
1604 // - length 0–3: 1 edit max (too short to suggest at all, really)
1605 // - length 4–7: 2 edits max
1606 // - length 8+: 3 edits max
1607 let max_distance = match needle.len() {
1608 0..=3 => 1,
1609 4..=7 => 2,
1610 _ => 3,
1611 };
1612
1613 let mut best: Option<(&'a str, usize)> = None;
1614 for cand in candidates {
1615 let dist = levenshtein(needle, cand);
1616 if dist > max_distance {
1617 continue;
1618 }
1619 match best {
1620 Some((_, prev_dist)) if dist >= prev_dist => {}
1621 _ => best = Some((cand, dist)),
1622 }
1623 }
1624 best.map(|(cand, _)| cand.to_owned())
1625}
1626
/// Byte-wise Levenshtein edit distance between `a` and `b`.
///
/// Implemented inline rather than pulled from a crate: the engine crate
/// is on the WASM-safe surface and a new runtime dependency for a
/// once-per-construction helper would be a disproportionate trade
/// (Constitution III).
///
/// The comparison is over bytes, not `char`s. Rule IDs and names are
/// ASCII by construction, so the byte-level distance equals the
/// codepoint-level distance for the inputs this is used on.
fn levenshtein(a: &str, b: &str) -> usize {
    let (src, dst) = (a.as_bytes(), b.as_bytes());
    if src.is_empty() {
        return dst.len();
    }
    if dst.is_empty() {
        return src.len();
    }

    // Rolling two-row DP: `above` is the finished previous row, `row` is
    // the one being filled in.
    let mut above: Vec<usize> = (0..=dst.len()).collect();
    let mut row: Vec<usize> = vec![0; dst.len() + 1];
    for (i, &src_byte) in src.iter().enumerate() {
        row[0] = i + 1;
        for (j, &dst_byte) in dst.iter().enumerate() {
            let deletion = above[j + 1] + 1;
            let insertion = row[j] + 1;
            let substitution = above[j] + usize::from(src_byte != dst_byte);
            row[j + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut above, &mut row);
    }
    above[dst.len()]
}
1657
1658// ---------------------------------------------------------------------------
1659// Tests
1660// ---------------------------------------------------------------------------
1661
1662#[cfg(test)]
1663#[cfg_attr(coverage_nightly, coverage(off))]
1664mod tests {
1665 use super::*;
1666 use crate::clock::FixedClock;
1667 use marque_ism::IsmAttributes;
1668 use marque_rules::{
1669 Diagnostic, FixProposal, FixSource, Rule, RuleContext, RuleId, RuleSet, Severity,
1670 };
1671 use std::time::{Duration, UNIX_EPOCH};
1672
1673 #[test]
1674 fn heuristic_rule_axis_cap_matches_default_threshold() {
1675 // Issue #133 PR 4 invariant: the position-aware classification
1676 // heuristic's `Confidence::rule` cap is pinned at the default
1677 // `confidence_threshold` (0.95). Solo-candidate heuristic
1678 // fixes auto-apply at the default threshold; the empirical
1679 // corpus measurement (see `HEURISTIC_RULE_AXIS_CAP` doc and
1680 // `tools/corpus-analysis/output/heuristic_frequencies.json`)
1681 // justifies confidence ≥ 99.4% per-trigger, comfortably above
1682 // the cap.
1683 //
1684 // If a future change drops `HEURISTIC_RULE_AXIS_CAP` below
1685 // `Config::default().confidence_threshold()`, that's a
1686 // behavioral regression: heuristic fixes that previously auto-
1687 // applied at the default threshold would silently stop
1688 // applying, and the user-visible "fix-and-warn" surface
1689 // collapses to "warn-only-without-fix" without an explicit
1690 // intent recorded in the change.
1691 //
1692 // If a future change drops the default `confidence_threshold`
1693 // below `HEURISTIC_RULE_AXIS_CAP`, that's the inverse problem:
1694 // the heuristic suddenly becomes more aggressive than the
1695 // governance signal we agreed on. Either way, the equality
1696 // pin here forces a coordinated decision.
1697 let default_threshold = Config::default().confidence_threshold();
1698 assert!(
1699 (HEURISTIC_RULE_AXIS_CAP - default_threshold).abs() < 1e-6,
1700 "HEURISTIC_RULE_AXIS_CAP={HEURISTIC_RULE_AXIS_CAP} must equal \
1701 Config::default().confidence_threshold()={default_threshold}; \
1702 a divergence requires an intentional governance change recorded \
1703 in the cap's doc comment"
1704 );
1705 }
1706
1707 /// A test rule that emits a fixed list of FixProposals on every check call,
1708 /// ignoring the parsed attributes. Lets us drive the engine deterministically
1709 /// without depending on real CAPCO rule output.
1710 struct StubRule {
1711 id: &'static str,
1712 proposals: Vec<FixProposal>,
1713 }
1714
1715 impl Rule for StubRule {
1716 fn id(&self) -> RuleId {
1717 RuleId::new(self.id)
1718 }
1719 fn name(&self) -> &'static str {
1720 "stub"
1721 }
1722 fn default_severity(&self) -> Severity {
1723 Severity::Fix
1724 }
1725 fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
1726 self.proposals
1727 .iter()
1728 .map(|p| {
1729 Diagnostic::new(
1730 p.rule.clone(),
1731 Severity::Fix,
1732 p.span,
1733 "stub",
1734 "TEST",
1735 Some(p.clone()),
1736 )
1737 })
1738 .collect()
1739 }
1740 }
1741
1742 struct StubSet(Vec<Box<dyn Rule>>);
1743 impl RuleSet for StubSet {
1744 fn rules(&self) -> &[Box<dyn Rule>] {
1745 &self.0
1746 }
1747 fn schema_version(&self) -> &'static str {
1748 "TEST"
1749 }
1750 }
1751
1752 fn proposal(rule: &'static str, start: usize, end: usize, replacement: &str) -> FixProposal {
1753 proposal_with_confidence(rule, start, end, replacement, 1.0)
1754 }
1755
1756 fn proposal_with_confidence(
1757 rule: &'static str,
1758 start: usize,
1759 end: usize,
1760 replacement: &str,
1761 confidence: f32,
1762 ) -> FixProposal {
1763 FixProposal::new(
1764 RuleId::new(rule),
1765 FixSource::BuiltinRule,
1766 Span::new(start, end),
1767 "x",
1768 replacement,
1769 marque_rules::Confidence::strict(confidence),
1770 None,
1771 )
1772 }
1773
1774 fn engine_with(proposals: Vec<FixProposal>) -> Engine {
1775 engine_with_config(Config::default(), proposals)
1776 }
1777
1778 fn engine_with_config(config: Config, proposals: Vec<FixProposal>) -> Engine {
1779 let stub = StubRule {
1780 id: "TEST",
1781 proposals,
1782 };
1783 let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(stub)]));
1784 Engine::with_clock(
1785 config,
1786 vec![set],
1787 marque_capco::scheme::CapcoScheme::new(),
1788 Box::new(FixedClock::new(
1789 UNIX_EPOCH + Duration::from_secs(1_700_000_000),
1790 )),
1791 )
1792 .expect("default CAPCO scheme has no rewrite cycles")
1793 }
1794
    /// Shared input for the fix/lint tests below. It is long enough to
    /// cover the byte offsets those tests target, AND it contains a
    /// banner marking so the parser produces a candidate that triggers
    /// the rule loop in `Engine::lint`.
    const TEST_SRC: &[u8] = b"SECRET//NOFORN ";
1799
1800 #[test]
1801 fn fix_applies_disjoint_fixes_in_reverse_order() {
1802 // Two non-overlapping fixes; FR-016 sorts by span.end DESC so the
1803 // later one is applied first, preserving the earlier span's offsets.
1804 let engine = engine_with(vec![
1805 proposal("E001", 0, 6, "AA"), // "SECRET" → "AA"
1806 proposal("E002", 8, 14, "BB"), // "NOFORN" → "BB"
1807 ]);
1808 let result = engine.fix(TEST_SRC, FixMode::Apply);
1809 let out = String::from_utf8(result.source).unwrap();
1810 assert!(out.starts_with("AA//BB"), "got: {out:?}");
1811 assert_eq!(result.applied.len(), 2);
1812 }
1813
1814 #[test]
1815 fn overlap_guard_drops_overlapping_fix() {
1816 // Two fixes whose spans collide. C-1: keep one, drop the other.
1817 let engine = engine_with(vec![
1818 proposal("E001", 0, 6, "AA"),
1819 proposal("E002", 3, 10, "BB"), // overlaps E001
1820 ]);
1821 let result = engine.fix(TEST_SRC, FixMode::Apply);
1822 // Exactly one fix should be applied, the other should remain in
1823 // `remaining_diagnostics` so callers can see it was not silently
1824 // dropped.
1825 assert_eq!(result.applied.len(), 1, "applied: {:?}", result.applied);
1826 assert_eq!(
1827 result.remaining_diagnostics.len(),
1828 1,
1829 "remaining: {:?}",
1830 result.remaining_diagnostics
1831 );
1832 }
1833
1834 #[test]
1835 fn dry_run_returns_original_source_but_records_applied() {
1836 let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
1837 let result = engine.fix(TEST_SRC, FixMode::DryRun);
1838 assert_eq!(result.source, TEST_SRC, "dry-run must not mutate source");
1839 assert_eq!(result.applied.len(), 1);
1840 assert!(result.applied[0].dry_run, "dry_run flag must be set");
1841 }
1842
1843 #[test]
1844 fn fix_with_threshold_rejects_nan() {
1845 let engine = engine_with(vec![]);
1846 assert!(matches!(
1847 engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NAN)),
1848 Err(InvalidThreshold(_))
1849 ));
1850 }
1851
1852 #[test]
1853 fn fix_with_threshold_rejects_out_of_range() {
1854 let engine = engine_with(vec![]);
1855 assert!(matches!(
1856 engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(-0.1)),
1857 Err(InvalidThreshold(_))
1858 ));
1859 assert!(matches!(
1860 engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(1.1)),
1861 Err(InvalidThreshold(_))
1862 ));
1863 }
1864
1865 #[test]
1866 fn fix_with_threshold_accepts_boundaries() {
1867 let engine = engine_with(vec![]);
1868 assert!(
1869 engine
1870 .fix_with_threshold(TEST_SRC, FixMode::Apply, Some(0.0))
1871 .is_ok()
1872 );
1873 assert!(
1874 engine
1875 .fix_with_threshold(TEST_SRC, FixMode::Apply, Some(1.0))
1876 .is_ok()
1877 );
1878 }
1879
1880 #[test]
1881 fn fixed_clock_yields_deterministic_timestamps() {
1882 let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
1883 let r1 = engine.fix(TEST_SRC, FixMode::Apply);
1884 let r2 = engine.fix(TEST_SRC, FixMode::Apply);
1885 assert_eq!(r1.applied[0].timestamp, r2.applied[0].timestamp);
1886 }
1887
1888 // H-3: fix_with_threshold must reject non-finite overrides in all
1889 // directions, not just NaN. INFINITY and NEG_INFINITY are both caught
1890 // by the range check; this test pins that behavior so a future refactor
1891 // that uses e.g. `is_finite` instead of `contains + is_nan` cannot
1892 // silently regress.
1893 #[test]
1894 fn fix_with_threshold_rejects_infinity() {
1895 let engine = engine_with(vec![]);
1896 assert!(matches!(
1897 engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::INFINITY)),
1898 Err(InvalidThreshold(_))
1899 ));
1900 assert!(matches!(
1901 engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NEG_INFINITY)),
1902 Err(InvalidThreshold(_))
1903 ));
1904 }
1905
1906 // M-4: the confidence filter at `f.confidence.combined() >= threshold`
1907 // is on the hot path of Engine::fix. These two tests pin the `>=`
1908 // semantics so a future refactor that flips it to `>` (or vice versa)
1909 // is caught. "Confidence" here is the scalar `Confidence::combined()`
1910 // (= recognition × rule); the other axes (`region`, `runner_up_ratio`,
1911 // feature contributions) are audit-provenance metadata and do not
1912 // participate in the threshold gate.
1913 #[test]
1914 fn confidence_below_default_threshold_is_excluded() {
1915 // Config::default().confidence_threshold == 0.95. A fix at 0.94
1916 // must not be applied.
1917 let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.94)]);
1918 let result = engine.fix(TEST_SRC, FixMode::Apply);
1919 assert_eq!(result.applied.len(), 0);
1920 // The below-threshold fix is a suggestion — it survives in
1921 // remaining_diagnostics so the caller can surface it.
1922 assert_eq!(result.remaining_diagnostics.len(), 1);
1923 }
1924
1925 #[test]
1926 fn lint_rewrites_below_threshold_fix_severity_to_suggest() {
1927 // Issue #235 / #186 PR-3: the lint post-pass turns a Fix-severity
1928 // diagnostic carrying a sub-threshold proposal into a Suggest-
1929 // severity diagnostic, preserving the fix payload so the renderer
1930 // can show "did you mean?" instead of silently dropping the
1931 // candidate at the threshold gate.
1932 let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.5)]);
1933 let lint = engine.lint(TEST_SRC);
1934 assert_eq!(lint.diagnostics.len(), 1);
1935 assert_eq!(lint.diagnostics[0].severity, Severity::Suggest);
1936 assert!(
1937 lint.diagnostics[0].fix.is_some(),
1938 "the candidate fix must stay attached so the renderer can surface it"
1939 );
1940 assert_eq!(lint.suggest_count(), 1);
1941 // Confirm the engine still excludes Suggest from auto-apply.
1942 let fix_result = engine.fix(TEST_SRC, FixMode::Apply);
1943 assert_eq!(fix_result.applied.len(), 0);
1944 }
1945
1946 #[test]
1947 fn lint_does_not_rewrite_at_threshold_boundary() {
1948 // A fix at exactly the threshold (0.95) must NOT be rewritten
1949 // — it is auto-apply territory, not Suggest territory. This
1950 // pins the boundary semantics: the rewrite predicate is
1951 // strictly less-than, matching the engine's `>= threshold`
1952 // application gate.
1953 let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)]);
1954 let lint = engine.lint(TEST_SRC);
1955 assert_eq!(lint.diagnostics.len(), 1);
1956 assert_eq!(lint.diagnostics[0].severity, Severity::Fix);
1957 }
1958
1959 #[test]
1960 fn lint_post_pass_leaves_fix_severity_with_no_fix_payload_alone() {
1961 // The post-pass guard order matters: even though `Fix`-severity
1962 // diagnostics are the only ones eligible for the rewrite, a
1963 // diagnostic that doesn't carry a `FixProposal` (rare in
1964 // practice — `Fix`-severity rules normally always attach one
1965 // — but representable in the type) must be skipped by the
1966 // `let Some(fix) = d.fix.as_ref() else { continue }` arm and
1967 // keep its `Fix` severity. This pins the behavior so a future
1968 // refactor that hoists the threshold check above the fix-
1969 // presence check (and might rewrite to Suggest unconditionally)
1970 // is caught.
1971 struct FixWithoutProposalRule;
1972 impl Rule for FixWithoutProposalRule {
1973 fn id(&self) -> RuleId {
1974 RuleId::new("E997")
1975 }
1976 fn name(&self) -> &'static str {
1977 "stub-fix-no-proposal"
1978 }
1979 fn default_severity(&self) -> Severity {
1980 Severity::Fix
1981 }
1982 fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
1983 vec![Diagnostic::new(
1984 RuleId::new("E997"),
1985 Severity::Fix,
1986 Span::new(0, 6),
1987 "fix-severity diagnostic with no proposal",
1988 "TEST",
1989 None,
1990 )]
1991 }
1992 }
1993
1994 let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(FixWithoutProposalRule)]));
1995 let engine = Engine::with_clock(
1996 Config::default(),
1997 vec![set],
1998 marque_capco::scheme::CapcoScheme::new(),
1999 Box::new(FixedClock::new(
2000 UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2001 )),
2002 )
2003 .expect("default CAPCO scheme has no rewrite cycles");
2004
2005 let lint = engine.lint(TEST_SRC);
2006 assert_eq!(lint.diagnostics.len(), 1);
2007 assert_eq!(
2008 lint.diagnostics[0].severity,
2009 Severity::Fix,
2010 "Fix-severity diagnostic with no fix payload must NOT be rewritten to Suggest",
2011 );
2012 assert!(lint.diagnostics[0].fix.is_none());
2013 }
2014
2015 #[test]
2016 fn fix_excludes_explicit_suggest_severity_from_auto_apply() {
2017 // Issue #235 / #186 PR-3: a rule that emits at Severity::Suggest
2018 // directly with confidence ≥ threshold must STILL be excluded
2019 // from auto-apply by construction. The Suggest channel is a
2020 // hard "do not apply" signal regardless of the confidence
2021 // axis. This is the explicit-Suggest invariant; the StubRule
2022 // emits Fix-severity by default so we route through a custom
2023 // rule that emits Suggest directly.
2024 struct SuggestRule;
2025 impl Rule for SuggestRule {
2026 fn id(&self) -> RuleId {
2027 RuleId::new("S999")
2028 }
2029 fn name(&self) -> &'static str {
2030 "stub-suggest"
2031 }
2032 fn default_severity(&self) -> Severity {
2033 Severity::Suggest
2034 }
2035 fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
2036 let proposal = FixProposal::new(
2037 RuleId::new("S999"),
2038 FixSource::BuiltinRule,
2039 Span::new(0, 6),
2040 "SECRET",
2041 "TOP SECRET",
2042 marque_rules::Confidence::strict(1.0),
2043 None,
2044 );
2045 vec![Diagnostic::new(
2046 RuleId::new("S999"),
2047 Severity::Suggest,
2048 Span::new(0, 6),
2049 "explicit suggest with high confidence",
2050 "TEST",
2051 Some(proposal),
2052 )]
2053 }
2054 }
2055
2056 let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(SuggestRule)]));
2057 let engine = Engine::with_clock(
2058 Config::default(),
2059 vec![set],
2060 marque_capco::scheme::CapcoScheme::new(),
2061 Box::new(FixedClock::new(
2062 UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2063 )),
2064 )
2065 .expect("default CAPCO scheme has no rewrite cycles");
2066
2067 let lint = engine.lint(TEST_SRC);
2068 assert_eq!(lint.diagnostics.len(), 1);
2069 // Severity stays Suggest (post-pass leaves explicit Suggest alone).
2070 assert_eq!(lint.diagnostics[0].severity, Severity::Suggest);
2071 // Even at confidence 1.0, a Suggest-severity fix must not auto-apply.
2072 let fix_result = engine.fix(TEST_SRC, FixMode::Apply);
2073 assert_eq!(
2074 fix_result.applied.len(),
2075 0,
2076 "explicit Suggest-severity fix must not auto-apply regardless of confidence"
2077 );
2078 }
2079
2080 #[test]
2081 fn confidence_at_default_threshold_is_included() {
2082 // A fix at exactly 0.95 must be applied (inclusive threshold).
2083 let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)]);
2084 let result = engine.fix(TEST_SRC, FixMode::Apply);
2085 assert_eq!(result.applied.len(), 1);
2086 }
2087
2088 // M-5: the zero-length-span filter (`!f.span.is_empty()`) in fix_inner
2089 // is what masked the Phase 2 Span::new(0, 0) placeholders from the
2090 // C-1 overlap guard. This test pins that guard explicitly so a future
2091 // refactor that drops the filter is caught.
2092 #[test]
2093 fn zero_length_span_fix_is_filtered_before_sort() {
2094 let engine = engine_with(vec![proposal("E001", 5, 5, "X")]);
2095 let result = engine.fix(TEST_SRC, FixMode::Apply);
2096 assert_eq!(result.applied.len(), 0);
2097 // Source unchanged: no splice was attempted.
2098 assert_eq!(result.source, TEST_SRC);
2099 }
2100
2101 // L-4: all the other threshold tests go through fix_with_threshold
2102 // (override path). This exercises the Config-supplied path explicitly
2103 // so both branches of `fix_with_threshold_inner`'s threshold selection
2104 // are covered.
2105 #[test]
2106 fn config_supplied_threshold_filters_proposals() {
2107 let mut config = Config::default();
2108 config.set_confidence_threshold(0.5).unwrap();
2109 let engine = engine_with_config(
2110 config,
2111 vec![
2112 proposal_with_confidence("E001", 0, 6, "AA", 0.4), // below
2113 proposal_with_confidence("E002", 8, 14, "BB", 0.6), // above
2114 ],
2115 );
2116 let result = engine.fix(TEST_SRC, FixMode::Apply);
2117 // Only the 0.6 fix is applied.
2118 assert_eq!(result.applied.len(), 1);
2119 assert_eq!(result.applied[0].proposal.rule.as_str(), "E002");
2120 // The 0.4 fix surfaces as a remaining diagnostic.
2121 assert_eq!(result.remaining_diagnostics.len(), 1);
2122 }
2123
2124 // Phase 3 Task 2: PageBreak candidates must reset the engine's
2125 // PageContext accumulator. Without this, banner-validation rules on
2126 // the second page would see portions from the first page, producing
2127 // over-restrictive expected aggregates.
2128 #[test]
2129 fn lint_handles_multi_page_document_with_form_feed() {
2130 let src: &[u8] = b"(SECRET//NOFORN) page 1 body.\nSECRET//NOFORN\n\x0c(CONFIDENTIAL) page 2 body.\nCONFIDENTIAL\n";
2131 let engine = engine_with(vec![]);
2132 let result = engine.lint(src);
2133 // Stub rule with no proposals: clean lint, no panic, no parser
2134 // error from the page-break candidate (which is filtered before
2135 // parser.parse is called).
2136 assert!(result.is_clean());
2137 }
2138
2139 // F.1: PageContext reset semantics are observable.
2140 //
2141 // ContextRecorderRule captures the live `page_context.portion_count()`
2142 // every time it's invoked. By running the engine over a multi-page
2143 // document and inspecting the captured counts at each banner candidate,
2144 // we prove that the engine resets PageContext at the page break instead
2145 // of accumulating across pages.
2146 #[derive(Clone)]
2147 struct ContextRecorderRule {
2148 observations: std::sync::Arc<std::sync::Mutex<Vec<(marque_ism::MarkingType, usize)>>>,
2149 }
2150
2151 impl Rule for ContextRecorderRule {
2152 fn id(&self) -> RuleId {
2153 RuleId::new("RECORD")
2154 }
2155 fn name(&self) -> &'static str {
2156 "page-context-recorder"
2157 }
2158 fn default_severity(&self) -> Severity {
2159 Severity::Warn
2160 }
2161 fn check(&self, _attrs: &IsmAttributes, ctx: &RuleContext) -> Vec<Diagnostic> {
2162 let count = ctx
2163 .page_context
2164 .as_ref()
2165 .map(|pc| pc.portion_count())
2166 .unwrap_or(0);
2167 self.observations
2168 .lock()
2169 .unwrap()
2170 .push((ctx.marking_type, count));
2171 vec![]
2172 }
2173 }
2174
2175 struct RecorderSet(Vec<Box<dyn Rule>>);
2176 impl RuleSet for RecorderSet {
2177 fn rules(&self) -> &[Box<dyn Rule>] {
2178 &self.0
2179 }
2180 fn schema_version(&self) -> &'static str {
2181 "TEST"
2182 }
2183 }
2184
2185 #[test]
2186 fn page_context_resets_observably_across_form_feed() {
2187 use marque_ism::MarkingType;
2188 let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
2189 let rule = ContextRecorderRule {
2190 observations: std::sync::Arc::clone(&observations),
2191 };
2192 let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![Box::new(rule)]));
2193 let engine = Engine::with_clock(
2194 Config::default(),
2195 vec![set],
2196 marque_capco::scheme::CapcoScheme::new(),
2197 Box::new(FixedClock::new(
2198 UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2199 )),
2200 )
2201 .expect("default CAPCO scheme has no rewrite cycles");
2202
2203 // Two pages, separated by a form feed:
2204 // Page 1: one portion + one banner
2205 // Page break (\f)
2206 // Page 2: one portion + one banner
2207 //
2208 // The recorder fires on every candidate that reaches the rule loop.
2209 // For the page-1 banner we expect to see 1 accumulated portion.
2210 // For the page-2 banner we expect to see 1 accumulated portion
2211 // (NOT 2) — the form feed must have reset the context.
2212 let src: &[u8] = b"(SECRET//NF) p1 text\nSECRET//NOFORN\n\x0c(CONFIDENTIAL//NF) p2\nCONFIDENTIAL//NOFORN\n";
2213 let _ = engine.lint(src);
2214
2215 let obs = observations.lock().unwrap();
2216 // The recorder ran once per non-PageBreak candidate. Filter to
2217 // banners and check the page_context count each banner saw.
2218 let banner_counts: Vec<usize> = obs
2219 .iter()
2220 .filter(|(kind, _)| *kind == MarkingType::Banner)
2221 .map(|(_, count)| *count)
2222 .collect();
2223 assert_eq!(
2224 banner_counts.len(),
2225 2,
2226 "expected 2 banner observations, got: {obs:?}"
2227 );
2228 assert_eq!(
2229 banner_counts[0], 1,
2230 "page-1 banner should see 1 accumulated portion"
2231 );
2232 assert_eq!(
2233 banner_counts[1], 1,
2234 "page-2 banner should see 1 accumulated portion (the page-1 \
2235 portion must be cleared by the form feed)"
2236 );
2237 }
2238
2239 #[test]
2240 fn page_context_lint_starts_fresh_on_each_call() {
2241 // Calling Engine::lint twice on the same engine must produce a
2242 // fresh PageContext for the second call — no cross-call accumulation.
2243 use marque_ism::MarkingType;
2244 let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
2245 let rule = ContextRecorderRule {
2246 observations: std::sync::Arc::clone(&observations),
2247 };
2248 let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![Box::new(rule)]));
2249 let engine = Engine::with_clock(
2250 Config::default(),
2251 vec![set],
2252 marque_capco::scheme::CapcoScheme::new(),
2253 Box::new(FixedClock::new(
2254 UNIX_EPOCH + Duration::from_secs(1_700_000_000),
2255 )),
2256 )
2257 .expect("default CAPCO scheme has no rewrite cycles");
2258 let src: &[u8] = b"(SECRET//NF) text\nSECRET//NOFORN\n";
2259 let _ = engine.lint(src);
2260 let _ = engine.lint(src);
2261
2262 let obs = observations.lock().unwrap();
2263 // Both calls should see identical observations — if the second
2264 // call leaked state from the first, the page-2 banner_count would
2265 // double.
2266 let banner_counts: Vec<usize> = obs
2267 .iter()
2268 .filter(|(kind, _)| *kind == MarkingType::Banner)
2269 .map(|(_, count)| *count)
2270 .collect();
2271 assert_eq!(
2272 banner_counts.len(),
2273 2,
2274 "two lint calls should produce two banner observations"
2275 );
2276 assert_eq!(banner_counts, vec![1, 1]);
2277 }
2278
2279 // M6: FR-016 tiebreaker — same span, different rule IDs.
2280 // The sort is (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
2281 // When two fixes target the exact same span, rule_id ASC breaks the tie,
2282 // and C-1 drops the second (overlapping) fix.
2283 #[test]
2284 fn fr016_same_span_different_rule_ids_picks_lower_rule_id() {
2285 // Two proposals for span 0..6 with different rule IDs.
2286 // "C001" < "E001" lexicographically, so C001 is kept and E001 dropped.
2287 let engine = engine_with(vec![
2288 proposal("E001", 0, 6, "BB"),
2289 proposal("C001", 0, 6, "AA"),
2290 ]);
2291 let result = engine.fix(TEST_SRC, FixMode::Apply);
2292 assert_eq!(result.applied.len(), 1);
2293 assert_eq!(result.applied[0].proposal.rule.as_str(), "C001");
2294 assert_eq!(result.applied[0].proposal.replacement.as_ref(), "AA");
2295 }
2296
2297 // FR-016 tiebreaker — same span, same rule ID, different replacements.
2298 #[test]
2299 fn fr016_same_span_same_rule_picks_lower_replacement() {
2300 let engine = engine_with(vec![
2301 proposal("E001", 0, 6, "ZZZ"),
2302 proposal("E001", 0, 6, "AAA"),
2303 ]);
2304 let result = engine.fix(TEST_SRC, FixMode::Apply);
2305 assert_eq!(result.applied.len(), 1);
2306 assert_eq!(result.applied[0].proposal.replacement.as_ref(), "AAA");
2307 }
2308
2309 // -----------------------------------------------------------------------
2310 // Task #49 — rule-alias canonicalization + fail-loud on unknown keys
2311 // -----------------------------------------------------------------------
2312
2313 /// Stub rule with distinct, test-controlled id and name so we can
2314 /// exercise the alias-resolution logic. The base `StubRule` hardcodes
2315 /// `name() -> "stub"`, which collides across multiple rules and
2316 /// doesn't model real CAPCO rules.
2317 struct NamedStub {
2318 id: &'static str,
2319 name: &'static str,
2320 }
2321
2322 impl Rule for NamedStub {
2323 fn id(&self) -> RuleId {
2324 RuleId::new(self.id)
2325 }
2326 fn name(&self) -> &'static str {
2327 self.name
2328 }
2329 fn default_severity(&self) -> Severity {
2330 Severity::Warn
2331 }
2332 fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
2333 vec![]
2334 }
2335 }
2336
2337 fn named_rule_set(rules: &[(&'static str, &'static str)]) -> Box<dyn RuleSet> {
2338 let rules: Vec<Box<dyn Rule>> = rules
2339 .iter()
2340 .map(|(id, name)| Box::new(NamedStub { id, name }) as Box<dyn Rule>)
2341 .collect();
2342 Box::new(StubSet(rules))
2343 }
2344
2345 fn config_with_overrides(pairs: &[(&str, &str)]) -> Config {
2346 let mut config = Config::default();
2347 for (k, v) in pairs {
2348 config
2349 .rules
2350 .overrides
2351 .insert((*k).to_owned(), (*v).to_owned());
2352 }
2353 config
2354 }
2355
/// An override already keyed by rule ID stays under that key with its
/// severity intact.
#[test]
fn canonicalize_accepts_rule_id_form_unchanged() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = config_with_overrides(&[("E001", "warn")]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("should succeed");

    let severity = cfg.rules.overrides.get("E001").map(String::as_str);
    assert_eq!(severity, Some("warn"), "ID-form override keeps its key");
}
2367
/// An override keyed by rule name is re-keyed to the rule's ID, and the
/// name-form key is removed from the map.
#[test]
fn canonicalize_accepts_rule_name_form_and_resolves_to_id() {
    let rule_name = "portion-mark-in-banner";
    let rule_sets = vec![named_rule_set(&[("E001", rule_name)])];
    let mut cfg = config_with_overrides(&[(rule_name, "error")]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("should succeed");

    let overrides = &cfg.rules.overrides;
    assert_eq!(
        overrides.get("E001").map(String::as_str),
        Some("error"),
        "name-form override resolves to canonical ID"
    );
    assert!(
        !overrides.contains_key(rule_name),
        "pre-canonicalization name key must not survive"
    );
}
2386
/// A one-character typo of a known rule ID is rejected, and the error
/// carries the canonical ID as a suggestion.
#[test]
fn canonicalize_rejects_unknown_key_with_suggestion_for_near_miss() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    // The key ends in a lowercase `l`, not the digit `1`.
    let mut cfg = config_with_overrides(&[("E00l", "warn")]);

    let err = canonicalize_rule_overrides(&mut cfg, &rule_sets).unwrap_err();
    if let EngineConstructionError::UnknownRuleOverride { key, did_you_mean } = err {
        assert_eq!(key, "E00l");
        assert_eq!(
            did_you_mean.as_deref(),
            Some("E001"),
            "single-character typo should suggest the canonical ID"
        );
    } else {
        panic!("expected UnknownRuleOverride, got {err:?}");
    }
}
2404
/// A key that resembles no registered ID or name is rejected without a
/// suggestion: a nonsense hint is worse than no hint at all.
#[test]
fn canonicalize_rejects_unknown_key_without_suggestion_when_nothing_close() {
    let bogus_key = "totally-made-up-rule-name";
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = config_with_overrides(&[(bogus_key, "error")]);

    match canonicalize_rule_overrides(&mut cfg, &rule_sets).unwrap_err() {
        EngineConstructionError::UnknownRuleOverride { key, did_you_mean } => {
            assert_eq!(key, "totally-made-up-rule-name");
            assert!(
                did_you_mean.is_none(),
                "distant misses must not emit a suggestion; got {did_you_mean:?}"
            );
        }
        other => panic!("expected UnknownRuleOverride, got {other:?}"),
    }
}
2423
/// Overriding the same rule through both its ID and its name with
/// different severities is reported as a conflict naming both keys and
/// both severities.
#[test]
fn canonicalize_rejects_conflicting_id_and_name_forms_with_different_severity() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg =
        config_with_overrides(&[("E001", "warn"), ("portion-mark-in-banner", "error")]);

    let err = canonicalize_rule_overrides(&mut cfg, &rule_sets).unwrap_err();
    match err {
        EngineConstructionError::ConflictingRuleOverride {
            rule_id,
            keys,
            severities,
        } => {
            assert_eq!(rule_id, "E001");
            // The reported order follows map iteration and is not stable, so
            // check membership rather than position.
            assert!(keys.iter().any(|k| k.as_str() == "E001"));
            assert!(keys.iter().any(|k| k.as_str() == "portion-mark-in-banner"));
            assert!(severities.iter().any(|s| s.as_str() == "warn"));
            assert!(severities.iter().any(|s| s.as_str() == "error"));
        }
        other => panic!("expected ConflictingRuleOverride, got {other:?}"),
    }
}
2449
/// Writing both `E001 = "warn"` and `portion-mark-in-banner = "warn"`
/// (for example through copy-paste across config layers) is unambiguous:
/// it must succeed and collapse to a single ID-keyed entry.
#[test]
fn canonicalize_accepts_duplicate_forms_with_same_severity() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg =
        config_with_overrides(&[("E001", "warn"), ("portion-mark-in-banner", "warn")]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets)
        .expect("duplicate forms with same severity must succeed");

    let overrides = &cfg.rules.overrides;
    assert_eq!(overrides.len(), 1);
    assert_eq!(overrides.get("E001").map(String::as_str), Some("warn"));
}
2463
/// With two rule sets registered, a name from the first and an ID from the
/// second both resolve to their canonical IDs.
#[test]
fn canonicalize_accepts_overrides_across_multiple_rule_sets() {
    let rule_sets = vec![
        named_rule_set(&[("E001", "portion-mark-in-banner")]), // set A
        named_rule_set(&[("M500", "some-other-domain-rule")]), // set B
    ];
    let mut cfg = config_with_overrides(&[
        ("portion-mark-in-banner", "error"), // name form, owned by set A
        ("M500", "warn"),                    // ID form, owned by set B
    ]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("should succeed");

    for (rule_id, severity) in [("E001", "error"), ("M500", "warn")] {
        assert_eq!(
            cfg.rules.overrides.get(rule_id).map(String::as_str),
            Some(severity)
        );
    }
}
2482
/// A config with no overrides passes canonicalization and still has none
/// afterwards.
#[test]
fn canonicalize_empty_overrides_is_noop() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = Config::default();

    let outcome = canonicalize_rule_overrides(&mut cfg, &rule_sets);

    outcome.expect("empty overrides must succeed");
    assert!(cfg.rules.overrides.is_empty());
}
2490
/// An unknown override key is a user-config error, so it maps to exit
/// code 65.
#[test]
fn unknown_rule_override_exit_code_is_dataerr() {
    let unknown = EngineConstructionError::UnknownRuleOverride {
        key: "E999".into(),
        did_you_mean: None,
    };
    let code = unknown.exit_code();
    assert_eq!(code, 65, "EX_DATAERR for user-config errors");
}
2499
/// A conflicting pair of overrides maps to exit code 65, the same code
/// asserted for an unknown override key.
#[test]
fn conflicting_rule_override_exit_code_is_dataerr() {
    let conflict = EngineConstructionError::ConflictingRuleOverride {
        rule_id: "E001".into(),
        keys: Box::new(["E001".into(), "portion-mark-in-banner".into()]),
        severities: Box::new(["warn".into(), "error".into()]),
    };
    let code = conflict.exit_code();
    assert_eq!(code, 65);
}
2509
/// A rewrite cycle is a scheme defect rather than a user-config error, so
/// it stays on EX_UNAVAILABLE (69).
#[test]
fn rewrite_cycle_exit_code_is_unavailable() {
    let cycle = EngineConstructionError::RewriteCycle {
        axis: marque_scheme::CategoryId(0),
        members: Box::new(["a", "b"]),
    };
    let code = cycle.exit_code();
    assert_eq!(code, 69);
}
2520
/// Pins the edit-distance routine to hand-computed reference values so a
/// regression in the dynamic-programming code is caught.
#[test]
fn levenshtein_matches_reference_values() {
    use super::levenshtein;

    // Identical inputs.
    assert_eq!(levenshtein("", ""), 0);
    assert_eq!(levenshtein("E001", "E001"), 0);

    // One substitution.
    assert_eq!(levenshtein("E001", "E002"), 1);
    assert_eq!(levenshtein("E001", "E00l"), 1);

    // Classic mixed-edit example.
    assert_eq!(levenshtein("kitten", "sitting"), 3);

    // One side empty: the distance is the other side's length.
    assert_eq!(levenshtein("", "abc"), 3);
    assert_eq!(levenshtein("abc", ""), 3);
}
2533
/// The suggestion is the candidate with the smallest edit distance; among
/// equally close candidates the one listed first wins.
#[test]
fn suggest_closest_prefers_smaller_distance() {
    // "E00l" is one substitution away from both E001 and E002 and two away
    // from E010. E001 wins the tie because it appears first among the
    // equally close candidates.
    let cands = ["E001", "E002", "E010"];
    assert_eq!(
        super::suggest_closest("E00l", cands.iter().copied()),
        Some("E001".to_owned())
    );

    // The case above lists the closest candidate first, so on its own it
    // cannot tell "smallest distance wins" apart from "first acceptable
    // candidate wins". Listing the farther candidate first pins the
    // distance preference this test is named for.
    let farther_first = ["E010", "E001", "E002"];
    assert_eq!(
        super::suggest_closest("E00l", farther_first.iter().copied()),
        Some("E001".to_owned()),
        "a closer candidate must beat a farther one listed earlier"
    );
}
2545
/// A very short needle with no near neighbours yields no suggestion: the
/// threshold for a length-3 needle is 1, and every candidate here is many
/// edits away.
#[test]
fn suggest_closest_returns_none_when_nothing_is_close_enough() {
    let distant = ["portion-mark-in-banner", "missing-usa-trigraph"];
    let suggestion = super::suggest_closest("xyz", distant.iter().copied());
    assert!(suggestion.is_none());
}
2553}