marque_engine/engine.rs
1//! `Engine` — the configured, ready-to-run pipeline.
2
3use crate::clock::{Clock, SystemClock};
4use crate::output::{FixResult, LintResult};
5use aho_corasick::AhoCorasick;
6use marque_config::Config;
7use marque_ism::Span;
8use marque_rules::{AppliedFix, Diagnostic, FixProposal, FixSource, RuleId, RuleSet, Severity};
9use std::collections::HashMap;
10use std::sync::Arc;
11
/// Selects whether a fix run rewrites the input or only simulates the rewrite.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FixMode {
    /// Rewrite the source text with every fix that was kept.
    Apply,
    /// Simulate the run: the applied-fix records are still produced (flagged
    /// as dry-run) but the returned source is the caller's original bytes.
    DryRun,
}
20
/// Error produced when a per-call confidence threshold override is NaN or
/// falls outside the inclusive range `[0.0, 1.0]`.
///
/// The rejected value is carried in field `0` so callers can echo it back.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct InvalidThreshold(pub f32);

impl std::fmt::Display for InvalidThreshold {
    /// Renders a human-readable message that includes the rejected value.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(rejected) = self;
        write!(
            f,
            "confidence threshold {} is outside [0.0, 1.0] or is NaN",
            rejected
        )
    }
}

impl std::error::Error for InvalidThreshold {}
37
38/// A configured engine instance.
39pub struct Engine {
40 config: Config,
41 rule_sets: Vec<Box<dyn RuleSet>>,
42 clock: Box<dyn Clock>,
43 /// Corrections map wrapped in Arc once at construction time so that each
44 /// `RuleContext` clone in `lint()` is an O(1) refcount bump, not a
45 /// deep-clone of the entire HashMap.
46 corrections_arc: Option<Arc<HashMap<String, String>>>,
47 /// Pre-built Aho-Corasick automaton for pre-scanner text corrections.
48 /// Built once at construction time from the corrections map (excluding
49 /// no-op and "//" entries). `None` when the corrections map is empty or
50 /// all entries are filtered out.
51 corrections_ac: Option<CachedAhoCorasick>,
52}
53
54/// Cached AhoCorasick automaton + the active (key, value) pairs that
55/// correspond to its pattern indices.
56struct CachedAhoCorasick {
57 ac: AhoCorasick,
58 /// Active correction pairs, indexed by `PatternID::as_usize()`.
59 active: Vec<(Box<str>, Box<str>)>,
60}
61
62impl Engine {
63 /// Create a new engine with the given configuration and rule sets.
64 pub fn new(config: Config, rule_sets: Vec<Box<dyn RuleSet>>) -> Self {
65 Self::with_clock(config, rule_sets, Box::new(SystemClock))
66 }
67
68 /// Create an engine with a custom clock (for deterministic tests).
69 pub fn with_clock(
70 mut config: Config,
71 rule_sets: Vec<Box<dyn RuleSet>>,
72 clock: Box<dyn Clock>,
73 ) -> Self {
74 // Take ownership of the corrections map instead of cloning —
75 // nothing reads config.corrections after construction.
76 let corrections_arc = if config.corrections.is_empty() {
77 None
78 } else {
79 Some(Arc::new(std::mem::take(&mut config.corrections)))
80 };
81
82 // Pre-build the AhoCorasick automaton for pre-scanner text corrections.
83 // This is O(total pattern bytes) and done once, not per-lint call.
84 let corrections_ac = corrections_arc.as_ref().and_then(|corrections| {
85 // Sort by key for deterministic pattern ordering — HashMap
86 // iteration order is random (hash seed varies per process),
87 // and AhoCorasick pattern IDs depend on insertion order.
88 let mut active: Vec<(Box<str>, Box<str>)> = corrections
89 .iter()
90 .filter(|(k, v)| k != v && k.as_str() != "//")
91 .map(|(k, v)| (k.as_str().into(), v.as_str().into()))
92 .collect();
93 active.sort_by(|(a, _), (b, _)| a.cmp(b));
94 if active.is_empty() {
95 return None;
96 }
97 let patterns: Vec<&str> = active.iter().map(|(k, _)| k.as_ref()).collect();
98 match AhoCorasick::new(&patterns) {
99 Ok(ac) => Some(CachedAhoCorasick { ac, active }),
100 Err(e) => {
101 tracing::warn!(
102 "failed to build AhoCorasick automaton for corrections map \
103 ({} patterns): {e}; pre-scanner text corrections disabled",
104 patterns.len()
105 );
106 None
107 }
108 }
109 });
110
111 Self {
112 config,
113 rule_sets,
114 clock,
115 corrections_arc,
116 corrections_ac,
117 }
118 }
119
120 /// Lint a UTF-8 text buffer. Returns diagnostics without modifying input.
121 pub fn lint(&self, source: &[u8]) -> LintResult {
122 use marque_core::{Parser, Scanner};
123 use marque_ism::{CapcoTokenSet, MarkingType, PageContext};
124 use marque_rules::RuleContext;
125
126 let token_set = CapcoTokenSet;
127 let parser = Parser::new(&token_set);
128 let candidates = Scanner::scan(source);
129
130 // corrections_arc was built once at Engine construction; each clone here
131 // is an O(1) refcount bump.
132 let corrections_arc = self.corrections_arc.clone();
133
134 let mut diagnostics = Vec::new();
135 // Build page context by accumulating portion markings in document order.
136 // Banner and CAB rules receive this context so they can validate the
137 // observed banner against the expected composite. Phase 3 wires the
138 // page-break reset below — the scanner emits a `MarkingType::PageBreak`
139 // candidate at every form-feed and at every `\n\n\n+` run; on each
140 // such candidate we drop the accumulator and start a fresh page.
141 let mut page_context = PageContext::new();
142 // Cache the current Arc<PageContext> so that consecutive banner/CAB
143 // candidates on the same page share a single allocation. The cache is
144 // invalidated (set to None) whenever a new portion is accumulated or
145 // a page break resets the context.
146 let mut page_context_arc: Option<Arc<PageContext>> = None;
147
148 for candidate in &candidates {
149 // Page-break candidates are scanner-emitted boundaries with no
150 // parsable content. Reset the context BEFORE attempting to parse
151 // — otherwise the parser's MalformedMarking error would skip the
152 // continue and leave us accumulating across pages.
153 if candidate.kind == MarkingType::PageBreak {
154 page_context = PageContext::new();
155 page_context_arc = None;
156 continue;
157 }
158
159 let Ok(parsed) = parser.parse(candidate, source) else {
160 continue;
161 };
162
163 // Accumulate portions before running banner/CAB rules so that
164 // when we reach a banner candidate the context already reflects
165 // all preceding portion data.
166 if parsed.kind == MarkingType::Portion {
167 page_context.add_portion(parsed.attrs.clone());
168 // Invalidate the cached Arc so the next banner/CAB gets a
169 // fresh snapshot. We rebuild it lazily below.
170 page_context_arc = None;
171 }
172
173 // Phase 3: zone and position are Option-typed and stay None
174 // until a structural scanner pass can prove them. The previous
175 // hardcoded `Zone::Body`/`DocumentPosition::Body` was a silent
176 // lie to any future rule that read them.
177 let ctx_page = if parsed.kind != MarkingType::Portion && !page_context.is_empty() {
178 // Lazily wrap the accumulated context in an Arc once per
179 // page-context snapshot; subsequent banner/CAB candidates on
180 // the same page clone only the cheap Arc pointer.
181 Some(
182 page_context_arc
183 .get_or_insert_with(|| Arc::new(page_context.clone()))
184 .clone(),
185 )
186 } else {
187 None
188 };
189 let ctx = RuleContext {
190 marking_type: candidate.kind,
191 zone: None,
192 position: None,
193 page_context: ctx_page,
194 corrections: corrections_arc.clone(),
195 };
196 for rule_set in &self.rule_sets {
197 for rule in rule_set.rules() {
198 // Skip rules that are configured as Off.
199 let configured_severity = self
200 .config
201 .rules
202 .overrides
203 .get(rule.id().as_str())
204 .and_then(|s| Severity::parse_config(s))
205 .unwrap_or(rule.default_severity());
206
207 if configured_severity == Severity::Off {
208 continue;
209 }
210
211 let mut diags = rule.check(&parsed.attrs, &ctx);
212 // Apply configured severity override.
213 for d in &mut diags {
214 d.severity = configured_severity;
215 }
216 diagnostics.extend(diags);
217 }
218 }
219 }
220
221 // Pre-scanner text corrections: scan the raw source for
222 // corrections-map keys that the scanner missed (e.g., "SERCET" is
223 // not a known classification prefix, so the scanner never detects
224 // "SERCET//NF" as a candidate, and C001 never sees the token).
225 //
226 // This pass emits C001 diagnostics for raw-text matches that don't
227 // overlap with any C001 diagnostic already produced by the rule
228 // pipeline above. Spans reference the original source buffer.
229 if let Some(cached) = &self.corrections_ac {
230 let c001_severity = self
231 .config
232 .rules
233 .overrides
234 .get("C001")
235 .and_then(|s| Severity::parse_config(s))
236 .unwrap_or(Severity::Fix);
237
238 if c001_severity != Severity::Off {
239 // Collect spans already covered by rule-pipeline C001.
240 let existing_c001_spans: std::collections::HashSet<Span> = diagnostics
241 .iter()
242 .filter(|d| d.rule.as_str() == "C001")
243 .map(|d| d.span)
244 .collect();
245
246 // Use the pre-built AhoCorasick automaton to scan the full
247 // source in a single O(n + m) pass. The automaton and its
248 // active pairs were built once at Engine construction time.
249 for mat in cached.ac.find_iter(source) {
250 let span = Span::new(mat.start(), mat.end());
251 let (ref key, ref value) = cached.active[mat.pattern().as_usize()];
252
253 // Skip if the rule pipeline already produced a C001
254 // diagnostic for this exact span.
255 if !existing_c001_spans.contains(&span) {
256 let proposal = FixProposal::new(
257 RuleId::new("C001"),
258 FixSource::CorrectionsMap,
259 span,
260 key.as_ref(),
261 value.as_ref(),
262 1.0,
263 None,
264 );
265 diagnostics.push(Diagnostic::new(
266 RuleId::new("C001"),
267 c001_severity,
268 span,
269 format!("corrections map: {key:?} → {value:?}"),
270 "CONFIG:[corrections]",
271 Some(proposal),
272 ));
273 }
274 }
275 }
276 }
277
278 LintResult { diagnostics }
279 }
280
281 /// Lint and apply fixes. Returns fixed source and audit log.
282 ///
283 /// Fix application order follows FR-016: `(span.end DESC, span.start DESC,
284 /// rule_id ASC, replacement ASC)` so reverse-byte application preserves
285 /// earlier-span offsets and equal-span ties break deterministically.
286 ///
287 /// Uses the confidence threshold configured in the engine's `Config`.
288 /// To supply a per-call override (e.g., from a `--confidence` CLI flag
289 /// or an HTTP request field), use [`Engine::fix_with_threshold`].
290 pub fn fix(&self, source: &[u8], mode: FixMode) -> FixResult {
291 // The config threshold is pre-validated at load time, so the
292 // `Result` branch is unreachable.
293 self.fix_with_threshold(source, mode, None)
294 .expect("config-supplied confidence threshold is pre-validated")
295 }
296
297 /// Lint and apply fixes using an optional per-call confidence threshold.
298 ///
299 /// When `threshold_override` is `Some`, it replaces the config-level
300 /// threshold for this call only and is validated against `[0.0, 1.0]`.
301 /// When `None`, the engine falls back to `Config::confidence_threshold`.
302 pub fn fix_with_threshold(
303 &self,
304 source: &[u8],
305 mode: FixMode,
306 threshold_override: Option<f32>,
307 ) -> Result<FixResult, InvalidThreshold> {
308 let threshold = match threshold_override {
309 Some(value) => {
310 if !(0.0..=1.0).contains(&value) || value.is_nan() {
311 return Err(InvalidThreshold(value));
312 }
313 value
314 }
315 None => self.config.confidence_threshold(),
316 };
317
318 Ok(self.fix_inner(source, mode, threshold))
319 }
320
321 fn fix_inner(&self, source: &[u8], mode: FixMode, threshold: f32) -> FixResult {
322 use std::collections::HashSet;
323
324 // Two-pass fix strategy for pre-scanner text corrections.
325 //
326 // Pass 1: lint the original source. The pre-scanner text scan may
327 // produce C001 diagnostics for corrections-map matches the scanner
328 // missed (e.g., "SERCET" is not a known classification prefix).
329 // Apply those C001 fixes to produce an intermediate source.
330 //
331 // Pass 2: re-lint the intermediate source. The scanner now detects
332 // the corrected marking (e.g., "SECRET//NF") and additional rules
333 // fire (e.g., E001 on NF→NOFORN). Apply those fixes on top.
334 //
335 // Without this, the spec scenario "SERCET//NF → SECRET//NOFORN"
336 // would stop at "SECRET//NF".
337 let lint1 = self.lint(source);
338 let (effective_source, pass1_applied) =
339 self.apply_text_corrections(source, &lint1, threshold, mode);
340
341 let lint = if !pass1_applied.is_empty() {
342 // Re-lint the corrected source so the scanner picks up newly-valid markings.
343 self.lint(&effective_source)
344 } else {
345 lint1
346 };
347
348 let mut fixes: Vec<_> = lint
349 .diagnostics
350 .iter()
351 .filter_map(|d| d.fix.as_ref())
352 .filter(|f| f.confidence >= threshold)
353 .filter(|f| !f.span.is_empty())
354 .collect();
355
356 // FR-016: deterministic total-order fix application.
357 // Sort by (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
358 fixes.sort_by(|a, b| {
359 b.span
360 .end
361 .cmp(&a.span.end)
362 .then(b.span.start.cmp(&a.span.start))
363 .then(a.rule.cmp(&b.rule))
364 .then(a.replacement.cmp(&b.replacement))
365 });
366
367 // C-1: overlap guard. After the FR-016 sort, two fixes can still
368 // touch the same byte range if multiple rules emit a fix for the
369 // same span (or overlapping spans). Applying both via `splice`
370 // would silently corrupt the byte stream. We keep the first fix
371 // per span (which under FR-016 ordering is deterministic) and
372 // surface the dropped fixes through `remaining_diagnostics`.
373 //
374 // The walk is over fixes in reverse-end order, so a fix is kept
375 // only if its `span.end` is at or below the previous kept fix's
376 // `span.start` — i.e., strictly to the left, no overlap.
377 let mut kept_fixes: Vec<&marque_rules::FixProposal> = Vec::with_capacity(fixes.len());
378 let mut next_window_end: Option<usize> = None;
379 for fix in &fixes {
380 let fits = match next_window_end {
381 Some(boundary) => fix.span.end <= boundary,
382 None => true,
383 };
384 if fits {
385 next_window_end = Some(fix.span.start);
386 kept_fixes.push(*fix);
387 }
388 }
389
390 // M-4: hold the classifier id in an `Arc<str>` so cloning into each
391 // applied-fix audit record is an O(1) refcount bump rather than a
392 // full string copy per fix.
393 let classifier_id: Option<std::sync::Arc<str>> = self
394 .config
395 .user
396 .classifier_id
397 .as_deref()
398 .map(std::sync::Arc::from);
399 let dry_run = mode == FixMode::DryRun;
400 let now = self.clock.now();
401
402 // H-7: applied-fix lookup is keyed by (RuleId, Span). Use a HashSet
403 // so the per-diagnostic filter at the bottom of this function is
404 // O(1) per query instead of O(n) over a Vec.
405 let mut applied_keys: HashSet<(RuleId, Span)> = HashSet::with_capacity(kept_fixes.len());
406 let mut applied: Vec<AppliedFix> = Vec::with_capacity(kept_fixes.len());
407
408 // Only allocate the output buffer when we actually need to mutate it.
409 // Dry-run returns the original source verbatim.
410 let output = match mode {
411 FixMode::Apply => {
412 let mut buf = effective_source.clone();
413 for fix in kept_fixes {
414 buf.splice(fix.span.start..fix.span.end, fix.replacement.bytes());
415 applied_keys.insert((fix.rule.clone(), fix.span));
416 applied.push(AppliedFix::__engine_promote(
417 fix.clone(),
418 now,
419 classifier_id.clone(),
420 dry_run,
421 None, // input identifier set by CLI at the boundary
422 ));
423 }
424 buf
425 }
426 FixMode::DryRun => {
427 for fix in kept_fixes {
428 applied_keys.insert((fix.rule.clone(), fix.span));
429 applied.push(AppliedFix::__engine_promote(
430 fix.clone(),
431 now,
432 classifier_id.clone(),
433 dry_run,
434 None,
435 ));
436 }
437 source.to_vec()
438 }
439 };
440
441 // Prepend pass-1 text corrections to the applied list so they
442 // appear in the audit trail.
443 let mut all_applied = pass1_applied;
444 all_applied.extend(applied);
445
446 // Remaining diagnostics: those whose fix was not applied.
447 // Filter by (rule_id, span) pair — not just rule ID — so that if
448 // rule E001 fires on three spans and only one is fixed, the other
449 // two remain.
450 let remaining_diagnostics = lint
451 .diagnostics
452 .into_iter()
453 .filter(|d| {
454 !d.fix
455 .as_ref()
456 .is_some_and(|f| applied_keys.contains(&(f.rule.clone(), f.span)))
457 })
458 .collect();
459
460 FixResult {
461 source: output,
462 applied: all_applied,
463 remaining_diagnostics,
464 }
465 }
466
467 /// Apply pre-scanner text corrections (C001) from lint diagnostics and
468 /// return the corrected source + applied fixes. Used by `fix_inner` to
469 /// produce an intermediate source that the scanner can detect.
470 fn apply_text_corrections(
471 &self,
472 source: &[u8],
473 lint: &LintResult,
474 threshold: f32,
475 mode: FixMode,
476 ) -> (Vec<u8>, Vec<AppliedFix>) {
477 let mut text_fixes: Vec<&FixProposal> = lint
478 .diagnostics
479 .iter()
480 .filter(|d| d.rule.as_str() == "C001")
481 .filter_map(|d| d.fix.as_ref())
482 .filter(|f| f.source == FixSource::CorrectionsMap)
483 .filter(|f| f.confidence >= threshold)
484 .filter(|f| !f.span.is_empty())
485 .collect();
486
487 if text_fixes.is_empty() {
488 return (source.to_vec(), Vec::new());
489 }
490
491 // Sort and deduplicate using FR-016 order + C-1 overlap guard.
492 text_fixes.sort_by(|a, b| {
493 b.span
494 .end
495 .cmp(&a.span.end)
496 .then(b.span.start.cmp(&a.span.start))
497 .then(a.rule.cmp(&b.rule))
498 .then(a.replacement.cmp(&b.replacement))
499 });
500 let mut kept: Vec<&FixProposal> = Vec::new();
501 let mut next_end: Option<usize> = None;
502 for fix in &text_fixes {
503 let fits = next_end.is_none_or(|b| fix.span.end <= b);
504 if fits {
505 next_end = Some(fix.span.start);
506 kept.push(*fix);
507 }
508 }
509
510 let classifier_id: Option<Arc<str>> =
511 self.config.user.classifier_id.as_deref().map(Arc::from);
512 let dry_run = mode == FixMode::DryRun;
513 let now = self.clock.now();
514
515 // Always apply text corrections to the intermediate buffer, even in
516 // DryRun mode. This buffer is internal — pass 2 needs it to re-lint
517 // corrected text so downstream rules fire (e.g., E001 on NF after
518 // SERCET→SECRET). The final output for DryRun returns the original
519 // source in fix_inner, not this intermediate buffer.
520 let mut buf = source.to_vec();
521 let mut applied = Vec::with_capacity(kept.len());
522 for fix in &kept {
523 buf.splice(fix.span.start..fix.span.end, fix.replacement.bytes());
524 applied.push(AppliedFix::__engine_promote(
525 (*fix).clone(),
526 now,
527 classifier_id.clone(),
528 dry_run,
529 None,
530 ));
531 }
532
533 (buf, applied)
534 }
535}
536
537// ---------------------------------------------------------------------------
538// Tests
539// ---------------------------------------------------------------------------
540
541#[cfg(test)]
542mod tests {
543 use super::*;
544 use crate::clock::FixedClock;
545 use marque_ism::IsmAttributes;
546 use marque_rules::{
547 Diagnostic, FixProposal, FixSource, Rule, RuleContext, RuleId, RuleSet, Severity,
548 };
549 use std::time::{Duration, UNIX_EPOCH};
550
551 /// A test rule that emits a fixed list of FixProposals on every check call,
552 /// ignoring the parsed attributes. Lets us drive the engine deterministically
553 /// without depending on real CAPCO rule output.
554 struct StubRule {
555 id: &'static str,
556 proposals: Vec<FixProposal>,
557 }
558
559 impl Rule for StubRule {
560 fn id(&self) -> RuleId {
561 RuleId::new(self.id)
562 }
563 fn name(&self) -> &'static str {
564 "stub"
565 }
566 fn default_severity(&self) -> Severity {
567 Severity::Fix
568 }
569 fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
570 self.proposals
571 .iter()
572 .map(|p| {
573 Diagnostic::new(
574 p.rule.clone(),
575 Severity::Fix,
576 p.span,
577 "stub",
578 "TEST",
579 Some(p.clone()),
580 )
581 })
582 .collect()
583 }
584 }
585
586 struct StubSet(Vec<Box<dyn Rule>>);
587 impl RuleSet for StubSet {
588 fn rules(&self) -> &[Box<dyn Rule>] {
589 &self.0
590 }
591 fn schema_version(&self) -> &'static str {
592 "TEST"
593 }
594 }
595
596 fn proposal(rule: &'static str, start: usize, end: usize, replacement: &str) -> FixProposal {
597 proposal_with_confidence(rule, start, end, replacement, 1.0)
598 }
599
600 fn proposal_with_confidence(
601 rule: &'static str,
602 start: usize,
603 end: usize,
604 replacement: &str,
605 confidence: f32,
606 ) -> FixProposal {
607 FixProposal::new(
608 RuleId::new(rule),
609 FixSource::BuiltinRule,
610 Span::new(start, end),
611 "x",
612 replacement,
613 confidence,
614 None,
615 )
616 }
617
618 fn engine_with(proposals: Vec<FixProposal>) -> Engine {
619 engine_with_config(Config::default(), proposals)
620 }
621
622 fn engine_with_config(config: Config, proposals: Vec<FixProposal>) -> Engine {
623 let stub = StubRule {
624 id: "TEST",
625 proposals,
626 };
627 let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(stub)]));
628 Engine::with_clock(
629 config,
630 vec![set],
631 Box::new(FixedClock::new(
632 UNIX_EPOCH + Duration::from_secs(1_700_000_000),
633 )),
634 )
635 }
636
/// Shared test input. It is long enough to cover the byte offsets used by
/// the test fixes, and it contains a banner marking so that the parser
/// yields a candidate and the rule loop in `Engine::lint` actually runs.
const TEST_SRC: &[u8] = b"SECRET//NOFORN ";
641
#[test]
fn fix_applies_disjoint_fixes_in_reverse_order() {
    // Two fixes on disjoint spans. FR-016 orders by span.end DESC, so the
    // right-hand fix is spliced first and the left-hand span's offsets
    // remain valid.
    let secret_fix = proposal("E001", 0, 6, "AA"); // "SECRET" → "AA"
    let noforn_fix = proposal("E002", 8, 14, "BB"); // "NOFORN" → "BB"
    let outcome = engine_with(vec![secret_fix, noforn_fix]).fix(TEST_SRC, FixMode::Apply);

    let applied_count = outcome.applied.len();
    let out = String::from_utf8(outcome.source).unwrap();
    assert!(out.starts_with("AA//BB"), "got: {out:?}");
    assert_eq!(applied_count, 2);
}
655
#[test]
fn overlap_guard_drops_overlapping_fix() {
    // Colliding spans: C-1 must keep exactly one of the two fixes.
    let first = proposal("E001", 0, 6, "AA");
    let colliding = proposal("E002", 3, 10, "BB"); // overlaps E001
    let engine = engine_with(vec![first, colliding]);

    let outcome = engine.fix(TEST_SRC, FixMode::Apply);

    // One fix is applied; the loser must show up in
    // `remaining_diagnostics` so callers can tell it was not silently
    // discarded.
    assert_eq!(outcome.applied.len(), 1, "applied: {:?}", outcome.applied);
    assert_eq!(
        outcome.remaining_diagnostics.len(),
        1,
        "remaining: {:?}",
        outcome.remaining_diagnostics
    );
}
675
#[test]
fn dry_run_returns_original_source_but_records_applied() {
    // Dry-run: bytes come back untouched, yet the fix is still recorded
    // and flagged as a dry run.
    let outcome = engine_with(vec![proposal("E001", 0, 6, "AA")]).fix(TEST_SRC, FixMode::DryRun);
    assert_eq!(outcome.source, TEST_SRC, "dry-run must not mutate source");
    assert_eq!(outcome.applied.len(), 1);
    let record = &outcome.applied[0];
    assert!(record.dry_run, "dry_run flag must be set");
}
684
#[test]
fn fix_with_threshold_rejects_nan() {
    // A NaN override must be rejected with InvalidThreshold.
    let engine = engine_with(Vec::new());
    let outcome = engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NAN));
    assert!(matches!(outcome, Err(InvalidThreshold(_))));
}
693
#[test]
fn fix_with_threshold_rejects_out_of_range() {
    // Values just below 0.0 and just above 1.0 are both invalid.
    let engine = engine_with(Vec::new());
    for out_of_range in [-0.1_f32, 1.1_f32] {
        let outcome = engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(out_of_range));
        assert!(matches!(outcome, Err(InvalidThreshold(_))));
    }
}
706
#[test]
fn fix_with_threshold_accepts_boundaries() {
    // The range is inclusive at both ends.
    let engine = engine_with(Vec::new());
    for boundary in [0.0_f32, 1.0_f32] {
        let outcome = engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(boundary));
        assert!(outcome.is_ok());
    }
}
721
#[test]
fn fixed_clock_yields_deterministic_timestamps() {
    // With a frozen clock, two runs stamp their audit records identically.
    let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
    let first_run = engine.fix(TEST_SRC, FixMode::Apply);
    let second_run = engine.fix(TEST_SRC, FixMode::Apply);
    let first_stamp = &first_run.applied[0].timestamp;
    let second_stamp = &second_run.applied[0].timestamp;
    assert_eq!(first_stamp, second_stamp);
}
729
// H-3: non-finite overrides must be rejected in every direction, not only
// NaN. Both infinities fail the range check; pinning that here means a
// later refactor of the validation (say, to an `is_finite`-based check)
// cannot quietly start accepting them.
#[test]
fn fix_with_threshold_rejects_infinity() {
    let engine = engine_with(Vec::new());
    for non_finite in [f32::INFINITY, f32::NEG_INFINITY] {
        let outcome = engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(non_finite));
        assert!(matches!(outcome, Err(InvalidThreshold(_))));
    }
}
747
// M-4: the fix pipeline filters with `f.confidence >= threshold`. This test
// and `confidence_at_default_threshold_is_included` pin the inclusive `>=`
// so that flipping it to `>` (or the reverse) is caught.
#[test]
fn confidence_below_default_threshold_is_excluded() {
    // The default threshold is 0.95, so a 0.94 fix must not be applied.
    let below = proposal_with_confidence("E001", 0, 6, "AA", 0.94);
    let outcome = engine_with(vec![below]).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 0);
    // It is still a suggestion: the diagnostic stays in
    // remaining_diagnostics for the caller to surface.
    assert_eq!(outcome.remaining_diagnostics.len(), 1);
}
762
#[test]
fn confidence_at_default_threshold_is_included() {
    // Exactly 0.95 meets the (inclusive) default threshold and is applied.
    let at_threshold = proposal_with_confidence("E001", 0, 6, "AA", 0.95);
    let outcome = engine_with(vec![at_threshold]).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 1);
}
770
// M-5: `fix_inner` discards fixes with an empty span
// (`!f.span.is_empty()`). That filter is what kept the Phase 2
// `Span::new(0, 0)` placeholders away from the C-1 overlap guard, so it is
// pinned here in case a refactor drops it.
#[test]
fn zero_length_span_fix_is_filtered_before_sort() {
    let empty_span_fix = proposal("E001", 5, 5, "X");
    let outcome = engine_with(vec![empty_span_fix]).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 0);
    // No splice happened, so the bytes are exactly the input.
    assert_eq!(outcome.source, TEST_SRC);
}
783
// L-4: the other threshold tests use the per-call override. This one drives
// the `None` branch of `fix_with_threshold`'s threshold selection, where
// the value comes from `Config`, so both branches are covered.
#[test]
fn config_supplied_threshold_filters_proposals() {
    let mut config = Config::default();
    config.set_confidence_threshold(0.5).unwrap();

    let too_low = proposal_with_confidence("E001", 0, 6, "AA", 0.4); // below
    let high_enough = proposal_with_confidence("E002", 8, 14, "BB", 0.6); // above
    let engine = engine_with_config(config, vec![too_low, high_enough]);

    let outcome = engine.fix(TEST_SRC, FixMode::Apply);

    // Only the 0.6 proposal clears the 0.5 threshold.
    assert_eq!(outcome.applied.len(), 1);
    assert_eq!(outcome.applied[0].proposal.rule.as_str(), "E002");
    // The 0.4 proposal is reported rather than applied.
    assert_eq!(outcome.remaining_diagnostics.len(), 1);
}
806
// Phase 3 Task 2: a PageBreak candidate has to reset the engine's
// PageContext accumulator. Otherwise banner-validation rules on page 2
// would also see page 1's portions and expect an over-restrictive
// aggregate.
#[test]
fn lint_handles_multi_page_document_with_form_feed() {
    let two_pages: &[u8] = b"(SECRET//NOFORN) page 1 body.\nSECRET//NOFORN\n\x0c(CONFIDENTIAL) page 2 body.\nCONFIDENTIAL\n";
    let outcome = engine_with(Vec::new()).lint(two_pages);
    // The stub rule has no proposals, so the lint must come back clean:
    // no panic and no parser error from the page-break candidate, which is
    // skipped before `parser.parse` is reached.
    assert!(outcome.is_clean());
}
821
822 // F.1: PageContext reset semantics are observable.
823 //
824 // ContextRecorderRule captures the live `page_context.portion_count()`
825 // every time it's invoked. By running the engine over a multi-page
826 // document and inspecting the captured counts at each banner candidate,
827 // we prove that the engine resets PageContext at the page break instead
828 // of accumulating across pages.
829 #[derive(Clone)]
830 struct ContextRecorderRule {
831 observations: std::sync::Arc<std::sync::Mutex<Vec<(marque_ism::MarkingType, usize)>>>,
832 }
833
834 impl Rule for ContextRecorderRule {
835 fn id(&self) -> RuleId {
836 RuleId::new("RECORD")
837 }
838 fn name(&self) -> &'static str {
839 "page-context-recorder"
840 }
841 fn default_severity(&self) -> Severity {
842 Severity::Warn
843 }
844 fn check(&self, _attrs: &IsmAttributes, ctx: &RuleContext) -> Vec<Diagnostic> {
845 let count = ctx
846 .page_context
847 .as_ref()
848 .map(|pc| pc.portion_count())
849 .unwrap_or(0);
850 self.observations
851 .lock()
852 .unwrap()
853 .push((ctx.marking_type, count));
854 vec![]
855 }
856 }
857
858 struct RecorderSet(Vec<Box<dyn Rule>>);
859 impl RuleSet for RecorderSet {
860 fn rules(&self) -> &[Box<dyn Rule>] {
861 &self.0
862 }
863 fn schema_version(&self) -> &'static str {
864 "TEST"
865 }
866 }
867
#[test]
fn page_context_resets_observably_across_form_feed() {
    use marque_ism::MarkingType;

    let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
    let recorder = ContextRecorderRule {
        observations: std::sync::Arc::clone(&observations),
    };
    let rule_sets: Vec<Box<dyn RuleSet>> = vec![Box::new(RecorderSet(vec![Box::new(recorder)]))];
    let frozen_at = UNIX_EPOCH + Duration::from_secs(1_700_000_000);
    let engine = Engine::with_clock(
        Config::default(),
        rule_sets,
        Box::new(FixedClock::new(frozen_at)),
    );

    // Document layout (pages split by a form feed):
    //   Page 1: one portion, then one banner
    //   \f
    //   Page 2: one portion, then one banner
    //
    // The recorder is invoked for each candidate that reaches the rule
    // loop. Each banner should report exactly 1 accumulated portion; a
    // count of 2 on the second banner would mean the form feed did not
    // reset the context.
    let src: &[u8] = b"(SECRET//NF) p1 text\nSECRET//NOFORN\n\x0c(CONFIDENTIAL//NF) p2\nCONFIDENTIAL//NOFORN\n";
    let _ = engine.lint(src);

    let obs = observations.lock().unwrap();
    // Keep only the banner observations and look at the portion count
    // each of them saw.
    let banner_counts: Vec<usize> = obs
        .iter()
        .filter_map(|&(kind, count)| (kind == MarkingType::Banner).then_some(count))
        .collect();
    assert_eq!(
        banner_counts.len(),
        2,
        "expected 2 banner observations, got: {obs:?}"
    );
    assert_eq!(
        banner_counts[0], 1,
        "page-1 banner should see 1 accumulated portion"
    );
    assert_eq!(
        banner_counts[1], 1,
        "page-2 banner should see 1 accumulated portion (the page-1 \
         portion must be cleared by the form feed)"
    );
}
919
#[test]
fn page_context_lint_starts_fresh_on_each_call() {
    // Two `Engine::lint` calls on one engine must each start from an empty
    // PageContext; nothing may carry over from the first call.
    use marque_ism::MarkingType;

    let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
    let recorder = ContextRecorderRule {
        observations: std::sync::Arc::clone(&observations),
    };
    let rule_sets: Vec<Box<dyn RuleSet>> = vec![Box::new(RecorderSet(vec![Box::new(recorder)]))];
    let frozen_at = UNIX_EPOCH + Duration::from_secs(1_700_000_000);
    let engine = Engine::with_clock(
        Config::default(),
        rule_sets,
        Box::new(FixedClock::new(frozen_at)),
    );

    let single_page: &[u8] = b"(SECRET//NF) text\nSECRET//NOFORN\n";
    for _ in 0..2 {
        let _ = engine.lint(single_page);
    }

    let recorded = observations.lock().unwrap();
    // Each call contributes one banner observation. If state leaked from
    // the first call, the second call's banner would report 2 portions.
    let banner_counts: Vec<usize> = recorded
        .iter()
        .filter_map(|&(kind, count)| (kind == MarkingType::Banner).then_some(count))
        .collect();
    assert_eq!(
        banner_counts.len(),
        2,
        "two lint calls should produce two banner observations"
    );
    assert_eq!(banner_counts, vec![1, 1]);
}
957
// M6: FR-016 tiebreaker for identical spans with different rule ids.
// Sort key: (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
// With the span equal, rule_id ASC decides which fix comes first, and the
// C-1 overlap guard then discards the other one.
#[test]
fn fr016_same_span_different_rule_ids_picks_lower_rule_id() {
    // Both proposals cover 0..6. "C001" sorts before "E001", so the C001
    // fix wins even though it was supplied second.
    let higher_id = proposal("E001", 0, 6, "BB");
    let lower_id = proposal("C001", 0, 6, "AA");
    let outcome = engine_with(vec![higher_id, lower_id]).fix(TEST_SRC, FixMode::Apply);

    assert_eq!(outcome.applied.len(), 1);
    let winner = &outcome.applied[0].proposal;
    assert_eq!(winner.rule.as_str(), "C001");
    assert_eq!(winner.replacement.as_ref(), "AA");
}
975
// FR-016 tiebreaker for identical span and rule id: the replacement text,
// ascending, is the final key.
#[test]
fn fr016_same_span_same_rule_picks_lower_replacement() {
    let later_in_order = proposal("E001", 0, 6, "ZZZ");
    let earlier_in_order = proposal("E001", 0, 6, "AAA");
    let outcome =
        engine_with(vec![later_in_order, earlier_in_order]).fix(TEST_SRC, FixMode::Apply);

    assert_eq!(outcome.applied.len(), 1);
    let winner = &outcome.applied[0].proposal;
    assert_eq!(winner.replacement.as_ref(), "AAA");
}
987}