marque_engine/
engine.rs

1//! `Engine` — the configured, ready-to-run pipeline.
2
3use crate::clock::{Clock, SystemClock};
4use crate::output::{FixResult, LintResult};
5use aho_corasick::AhoCorasick;
6use marque_config::Config;
7use marque_ism::Span;
8use marque_rules::{AppliedFix, Diagnostic, FixProposal, FixSource, RuleId, RuleSet, Severity};
9use std::collections::HashMap;
10use std::sync::Arc;
11
/// Selects whether a fix run mutates the source or only simulates doing so.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FixMode {
    /// Write the accepted fixes into the returned source text.
    Apply,
    /// Dry run: the audit stream matches `Apply`, but the source comes back unchanged.
    DryRun,
}
20
/// Error produced when a caller-supplied runtime confidence threshold
/// override is not inside the valid `[0.0, 1.0]` range (NaN included).
/// The wrapped value is the rejected threshold.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct InvalidThreshold(pub f32);

impl std::fmt::Display for InvalidThreshold {
    /// Renders a human-readable message that includes the rejected value.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(rejected) = self;
        write!(
            f,
            "confidence threshold {} is outside [0.0, 1.0] or is NaN",
            rejected
        )
    }
}

impl std::error::Error for InvalidThreshold {}
37
/// A configured engine instance: configuration, rule sets, a clock, and the
/// corrections data that `with_clock` prepares once at construction time.
pub struct Engine {
    /// Engine configuration. `with_clock` moves the corrections map out of
    /// this value (leaving that field empty); the map lives on in
    /// `corrections_arc`.
    config: Config,
    /// Rule sets whose rules `lint()` runs against every parsed candidate.
    rule_sets: Vec<Box<dyn RuleSet>>,
    /// Time source read once per fix pass to timestamp applied-fix records.
    clock: Box<dyn Clock>,
    /// Corrections map wrapped in Arc once at construction time so that each
    /// `RuleContext` clone in `lint()` is an O(1) refcount bump, not a
    /// deep-clone of the entire HashMap. `None` when the map was empty.
    corrections_arc: Option<Arc<HashMap<String, String>>>,
    /// Pre-built Aho-Corasick automaton for pre-scanner text corrections.
    /// Built once at construction time from the corrections map (excluding
    /// no-op and "//" entries). `None` when the corrections map is empty,
    /// all entries are filtered out, or the automaton failed to build.
    corrections_ac: Option<CachedAhoCorasick>,
}
53
/// Cached AhoCorasick automaton + the active (key, value) pairs that
/// correspond to its pattern indices.
///
/// The two fields must stay in lock-step: `lint()` indexes `active` with the
/// pattern id of each match reported by `ac`.
struct CachedAhoCorasick {
    /// Automaton built from the keys of `active`, in the same order.
    ac: AhoCorasick,
    /// Active correction pairs, indexed by `PatternID::as_usize()`.
    /// Each entry is `(text to find, replacement text)`.
    active: Vec<(Box<str>, Box<str>)>,
}
61
62impl Engine {
63    /// Create a new engine with the given configuration and rule sets.
64    pub fn new(config: Config, rule_sets: Vec<Box<dyn RuleSet>>) -> Self {
65        Self::with_clock(config, rule_sets, Box::new(SystemClock))
66    }
67
68    /// Create an engine with a custom clock (for deterministic tests).
69    pub fn with_clock(
70        mut config: Config,
71        rule_sets: Vec<Box<dyn RuleSet>>,
72        clock: Box<dyn Clock>,
73    ) -> Self {
74        // Take ownership of the corrections map instead of cloning —
75        // nothing reads config.corrections after construction.
76        let corrections_arc = if config.corrections.is_empty() {
77            None
78        } else {
79            Some(Arc::new(std::mem::take(&mut config.corrections)))
80        };
81
82        // Pre-build the AhoCorasick automaton for pre-scanner text corrections.
83        // This is O(total pattern bytes) and done once, not per-lint call.
84        let corrections_ac = corrections_arc.as_ref().and_then(|corrections| {
85            // Sort by key for deterministic pattern ordering — HashMap
86            // iteration order is random (hash seed varies per process),
87            // and AhoCorasick pattern IDs depend on insertion order.
88            let mut active: Vec<(Box<str>, Box<str>)> = corrections
89                .iter()
90                .filter(|(k, v)| k != v && k.as_str() != "//")
91                .map(|(k, v)| (k.as_str().into(), v.as_str().into()))
92                .collect();
93            active.sort_by(|(a, _), (b, _)| a.cmp(b));
94            if active.is_empty() {
95                return None;
96            }
97            let patterns: Vec<&str> = active.iter().map(|(k, _)| k.as_ref()).collect();
98            match AhoCorasick::new(&patterns) {
99                Ok(ac) => Some(CachedAhoCorasick { ac, active }),
100                Err(e) => {
101                    tracing::warn!(
102                        "failed to build AhoCorasick automaton for corrections map \
103                         ({} patterns): {e}; pre-scanner text corrections disabled",
104                        patterns.len()
105                    );
106                    None
107                }
108            }
109        });
110
111        Self {
112            config,
113            rule_sets,
114            clock,
115            corrections_arc,
116            corrections_ac,
117        }
118    }
119
120    /// Lint a UTF-8 text buffer. Returns diagnostics without modifying input.
121    pub fn lint(&self, source: &[u8]) -> LintResult {
122        use marque_core::{Parser, Scanner};
123        use marque_ism::{CapcoTokenSet, MarkingType, PageContext};
124        use marque_rules::RuleContext;
125
126        let token_set = CapcoTokenSet;
127        let parser = Parser::new(&token_set);
128        let candidates = Scanner::scan(source);
129
130        // corrections_arc was built once at Engine construction; each clone here
131        // is an O(1) refcount bump.
132        let corrections_arc = self.corrections_arc.clone();
133
134        let mut diagnostics = Vec::new();
135        // Build page context by accumulating portion markings in document order.
136        // Banner and CAB rules receive this context so they can validate the
137        // observed banner against the expected composite. Phase 3 wires the
138        // page-break reset below — the scanner emits a `MarkingType::PageBreak`
139        // candidate at every form-feed and at every `\n\n\n+` run; on each
140        // such candidate we drop the accumulator and start a fresh page.
141        let mut page_context = PageContext::new();
142        // Cache the current Arc<PageContext> so that consecutive banner/CAB
143        // candidates on the same page share a single allocation. The cache is
144        // invalidated (set to None) whenever a new portion is accumulated or
145        // a page break resets the context.
146        let mut page_context_arc: Option<Arc<PageContext>> = None;
147
148        for candidate in &candidates {
149            // Page-break candidates are scanner-emitted boundaries with no
150            // parsable content. Reset the context BEFORE attempting to parse
151            // — otherwise the parser's MalformedMarking error would skip the
152            // continue and leave us accumulating across pages.
153            if candidate.kind == MarkingType::PageBreak {
154                page_context = PageContext::new();
155                page_context_arc = None;
156                continue;
157            }
158
159            let Ok(parsed) = parser.parse(candidate, source) else {
160                continue;
161            };
162
163            // Accumulate portions before running banner/CAB rules so that
164            // when we reach a banner candidate the context already reflects
165            // all preceding portion data.
166            if parsed.kind == MarkingType::Portion {
167                page_context.add_portion(parsed.attrs.clone());
168                // Invalidate the cached Arc so the next banner/CAB gets a
169                // fresh snapshot. We rebuild it lazily below.
170                page_context_arc = None;
171            }
172
173            // Phase 3: zone and position are Option-typed and stay None
174            // until a structural scanner pass can prove them. The previous
175            // hardcoded `Zone::Body`/`DocumentPosition::Body` was a silent
176            // lie to any future rule that read them.
177            let ctx_page = if parsed.kind != MarkingType::Portion && !page_context.is_empty() {
178                // Lazily wrap the accumulated context in an Arc once per
179                // page-context snapshot; subsequent banner/CAB candidates on
180                // the same page clone only the cheap Arc pointer.
181                Some(
182                    page_context_arc
183                        .get_or_insert_with(|| Arc::new(page_context.clone()))
184                        .clone(),
185                )
186            } else {
187                None
188            };
189            let ctx = RuleContext {
190                marking_type: candidate.kind,
191                zone: None,
192                position: None,
193                page_context: ctx_page,
194                corrections: corrections_arc.clone(),
195            };
196            for rule_set in &self.rule_sets {
197                for rule in rule_set.rules() {
198                    // Skip rules that are configured as Off.
199                    let configured_severity = self
200                        .config
201                        .rules
202                        .overrides
203                        .get(rule.id().as_str())
204                        .and_then(|s| Severity::parse_config(s))
205                        .unwrap_or(rule.default_severity());
206
207                    if configured_severity == Severity::Off {
208                        continue;
209                    }
210
211                    let mut diags = rule.check(&parsed.attrs, &ctx);
212                    // Apply configured severity override.
213                    for d in &mut diags {
214                        d.severity = configured_severity;
215                    }
216                    diagnostics.extend(diags);
217                }
218            }
219        }
220
221        // Pre-scanner text corrections: scan the raw source for
222        // corrections-map keys that the scanner missed (e.g., "SERCET" is
223        // not a known classification prefix, so the scanner never detects
224        // "SERCET//NF" as a candidate, and C001 never sees the token).
225        //
226        // This pass emits C001 diagnostics for raw-text matches that don't
227        // overlap with any C001 diagnostic already produced by the rule
228        // pipeline above. Spans reference the original source buffer.
229        if let Some(cached) = &self.corrections_ac {
230            let c001_severity = self
231                .config
232                .rules
233                .overrides
234                .get("C001")
235                .and_then(|s| Severity::parse_config(s))
236                .unwrap_or(Severity::Fix);
237
238            if c001_severity != Severity::Off {
239                // Collect spans already covered by rule-pipeline C001.
240                let existing_c001_spans: std::collections::HashSet<Span> = diagnostics
241                    .iter()
242                    .filter(|d| d.rule.as_str() == "C001")
243                    .map(|d| d.span)
244                    .collect();
245
246                // Use the pre-built AhoCorasick automaton to scan the full
247                // source in a single O(n + m) pass. The automaton and its
248                // active pairs were built once at Engine construction time.
249                for mat in cached.ac.find_iter(source) {
250                    let span = Span::new(mat.start(), mat.end());
251                    let (ref key, ref value) = cached.active[mat.pattern().as_usize()];
252
253                    // Skip if the rule pipeline already produced a C001
254                    // diagnostic for this exact span.
255                    if !existing_c001_spans.contains(&span) {
256                        let proposal = FixProposal::new(
257                            RuleId::new("C001"),
258                            FixSource::CorrectionsMap,
259                            span,
260                            key.as_ref(),
261                            value.as_ref(),
262                            1.0,
263                            None,
264                        );
265                        diagnostics.push(Diagnostic::new(
266                            RuleId::new("C001"),
267                            c001_severity,
268                            span,
269                            format!("corrections map: {key:?} → {value:?}"),
270                            "CONFIG:[corrections]",
271                            Some(proposal),
272                        ));
273                    }
274                }
275            }
276        }
277
278        LintResult { diagnostics }
279    }
280
281    /// Lint and apply fixes. Returns fixed source and audit log.
282    ///
283    /// Fix application order follows FR-016: `(span.end DESC, span.start DESC,
284    /// rule_id ASC, replacement ASC)` so reverse-byte application preserves
285    /// earlier-span offsets and equal-span ties break deterministically.
286    ///
287    /// Uses the confidence threshold configured in the engine's `Config`.
288    /// To supply a per-call override (e.g., from a `--confidence` CLI flag
289    /// or an HTTP request field), use [`Engine::fix_with_threshold`].
290    pub fn fix(&self, source: &[u8], mode: FixMode) -> FixResult {
291        // The config threshold is pre-validated at load time, so the
292        // `Result` branch is unreachable.
293        self.fix_with_threshold(source, mode, None)
294            .expect("config-supplied confidence threshold is pre-validated")
295    }
296
297    /// Lint and apply fixes using an optional per-call confidence threshold.
298    ///
299    /// When `threshold_override` is `Some`, it replaces the config-level
300    /// threshold for this call only and is validated against `[0.0, 1.0]`.
301    /// When `None`, the engine falls back to `Config::confidence_threshold`.
302    pub fn fix_with_threshold(
303        &self,
304        source: &[u8],
305        mode: FixMode,
306        threshold_override: Option<f32>,
307    ) -> Result<FixResult, InvalidThreshold> {
308        let threshold = match threshold_override {
309            Some(value) => {
310                if !(0.0..=1.0).contains(&value) || value.is_nan() {
311                    return Err(InvalidThreshold(value));
312                }
313                value
314            }
315            None => self.config.confidence_threshold(),
316        };
317
318        Ok(self.fix_inner(source, mode, threshold))
319    }
320
321    fn fix_inner(&self, source: &[u8], mode: FixMode, threshold: f32) -> FixResult {
322        use std::collections::HashSet;
323
324        // Two-pass fix strategy for pre-scanner text corrections.
325        //
326        // Pass 1: lint the original source. The pre-scanner text scan may
327        // produce C001 diagnostics for corrections-map matches the scanner
328        // missed (e.g., "SERCET" is not a known classification prefix).
329        // Apply those C001 fixes to produce an intermediate source.
330        //
331        // Pass 2: re-lint the intermediate source. The scanner now detects
332        // the corrected marking (e.g., "SECRET//NF") and additional rules
333        // fire (e.g., E001 on NF→NOFORN). Apply those fixes on top.
334        //
335        // Without this, the spec scenario "SERCET//NF → SECRET//NOFORN"
336        // would stop at "SECRET//NF".
337        let lint1 = self.lint(source);
338        let (effective_source, pass1_applied) =
339            self.apply_text_corrections(source, &lint1, threshold, mode);
340
341        let lint = if !pass1_applied.is_empty() {
342            // Re-lint the corrected source so the scanner picks up newly-valid markings.
343            self.lint(&effective_source)
344        } else {
345            lint1
346        };
347
348        let mut fixes: Vec<_> = lint
349            .diagnostics
350            .iter()
351            .filter_map(|d| d.fix.as_ref())
352            .filter(|f| f.confidence >= threshold)
353            .filter(|f| !f.span.is_empty())
354            .collect();
355
356        // FR-016: deterministic total-order fix application.
357        // Sort by (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
358        fixes.sort_by(|a, b| {
359            b.span
360                .end
361                .cmp(&a.span.end)
362                .then(b.span.start.cmp(&a.span.start))
363                .then(a.rule.cmp(&b.rule))
364                .then(a.replacement.cmp(&b.replacement))
365        });
366
367        // C-1: overlap guard. After the FR-016 sort, two fixes can still
368        // touch the same byte range if multiple rules emit a fix for the
369        // same span (or overlapping spans). Applying both via `splice`
370        // would silently corrupt the byte stream. We keep the first fix
371        // per span (which under FR-016 ordering is deterministic) and
372        // surface the dropped fixes through `remaining_diagnostics`.
373        //
374        // The walk is over fixes in reverse-end order, so a fix is kept
375        // only if its `span.end` is at or below the previous kept fix's
376        // `span.start` — i.e., strictly to the left, no overlap.
377        let mut kept_fixes: Vec<&marque_rules::FixProposal> = Vec::with_capacity(fixes.len());
378        let mut next_window_end: Option<usize> = None;
379        for fix in &fixes {
380            let fits = match next_window_end {
381                Some(boundary) => fix.span.end <= boundary,
382                None => true,
383            };
384            if fits {
385                next_window_end = Some(fix.span.start);
386                kept_fixes.push(*fix);
387            }
388        }
389
390        // M-4: hold the classifier id in an `Arc<str>` so cloning into each
391        // applied-fix audit record is an O(1) refcount bump rather than a
392        // full string copy per fix.
393        let classifier_id: Option<std::sync::Arc<str>> = self
394            .config
395            .user
396            .classifier_id
397            .as_deref()
398            .map(std::sync::Arc::from);
399        let dry_run = mode == FixMode::DryRun;
400        let now = self.clock.now();
401
402        // H-7: applied-fix lookup is keyed by (RuleId, Span). Use a HashSet
403        // so the per-diagnostic filter at the bottom of this function is
404        // O(1) per query instead of O(n) over a Vec.
405        let mut applied_keys: HashSet<(RuleId, Span)> = HashSet::with_capacity(kept_fixes.len());
406        let mut applied: Vec<AppliedFix> = Vec::with_capacity(kept_fixes.len());
407
408        // Only allocate the output buffer when we actually need to mutate it.
409        // Dry-run returns the original source verbatim.
410        let output = match mode {
411            FixMode::Apply => {
412                let mut buf = effective_source.clone();
413                for fix in kept_fixes {
414                    buf.splice(fix.span.start..fix.span.end, fix.replacement.bytes());
415                    applied_keys.insert((fix.rule.clone(), fix.span));
416                    applied.push(AppliedFix::__engine_promote(
417                        fix.clone(),
418                        now,
419                        classifier_id.clone(),
420                        dry_run,
421                        None, // input identifier set by CLI at the boundary
422                    ));
423                }
424                buf
425            }
426            FixMode::DryRun => {
427                for fix in kept_fixes {
428                    applied_keys.insert((fix.rule.clone(), fix.span));
429                    applied.push(AppliedFix::__engine_promote(
430                        fix.clone(),
431                        now,
432                        classifier_id.clone(),
433                        dry_run,
434                        None,
435                    ));
436                }
437                source.to_vec()
438            }
439        };
440
441        // Prepend pass-1 text corrections to the applied list so they
442        // appear in the audit trail.
443        let mut all_applied = pass1_applied;
444        all_applied.extend(applied);
445
446        // Remaining diagnostics: those whose fix was not applied.
447        // Filter by (rule_id, span) pair — not just rule ID — so that if
448        // rule E001 fires on three spans and only one is fixed, the other
449        // two remain.
450        let remaining_diagnostics = lint
451            .diagnostics
452            .into_iter()
453            .filter(|d| {
454                !d.fix
455                    .as_ref()
456                    .is_some_and(|f| applied_keys.contains(&(f.rule.clone(), f.span)))
457            })
458            .collect();
459
460        FixResult {
461            source: output,
462            applied: all_applied,
463            remaining_diagnostics,
464        }
465    }
466
467    /// Apply pre-scanner text corrections (C001) from lint diagnostics and
468    /// return the corrected source + applied fixes. Used by `fix_inner` to
469    /// produce an intermediate source that the scanner can detect.
470    fn apply_text_corrections(
471        &self,
472        source: &[u8],
473        lint: &LintResult,
474        threshold: f32,
475        mode: FixMode,
476    ) -> (Vec<u8>, Vec<AppliedFix>) {
477        let mut text_fixes: Vec<&FixProposal> = lint
478            .diagnostics
479            .iter()
480            .filter(|d| d.rule.as_str() == "C001")
481            .filter_map(|d| d.fix.as_ref())
482            .filter(|f| f.source == FixSource::CorrectionsMap)
483            .filter(|f| f.confidence >= threshold)
484            .filter(|f| !f.span.is_empty())
485            .collect();
486
487        if text_fixes.is_empty() {
488            return (source.to_vec(), Vec::new());
489        }
490
491        // Sort and deduplicate using FR-016 order + C-1 overlap guard.
492        text_fixes.sort_by(|a, b| {
493            b.span
494                .end
495                .cmp(&a.span.end)
496                .then(b.span.start.cmp(&a.span.start))
497                .then(a.rule.cmp(&b.rule))
498                .then(a.replacement.cmp(&b.replacement))
499        });
500        let mut kept: Vec<&FixProposal> = Vec::new();
501        let mut next_end: Option<usize> = None;
502        for fix in &text_fixes {
503            let fits = next_end.is_none_or(|b| fix.span.end <= b);
504            if fits {
505                next_end = Some(fix.span.start);
506                kept.push(*fix);
507            }
508        }
509
510        let classifier_id: Option<Arc<str>> =
511            self.config.user.classifier_id.as_deref().map(Arc::from);
512        let dry_run = mode == FixMode::DryRun;
513        let now = self.clock.now();
514
515        // Always apply text corrections to the intermediate buffer, even in
516        // DryRun mode. This buffer is internal — pass 2 needs it to re-lint
517        // corrected text so downstream rules fire (e.g., E001 on NF after
518        // SERCET→SECRET). The final output for DryRun returns the original
519        // source in fix_inner, not this intermediate buffer.
520        let mut buf = source.to_vec();
521        let mut applied = Vec::with_capacity(kept.len());
522        for fix in &kept {
523            buf.splice(fix.span.start..fix.span.end, fix.replacement.bytes());
524            applied.push(AppliedFix::__engine_promote(
525                (*fix).clone(),
526                now,
527                classifier_id.clone(),
528                dry_run,
529                None,
530            ));
531        }
532
533        (buf, applied)
534    }
535}
536
537// ---------------------------------------------------------------------------
538// Tests
539// ---------------------------------------------------------------------------
540
541#[cfg(test)]
542mod tests {
543    use super::*;
544    use crate::clock::FixedClock;
545    use marque_ism::IsmAttributes;
546    use marque_rules::{
547        Diagnostic, FixProposal, FixSource, Rule, RuleContext, RuleId, RuleSet, Severity,
548    };
549    use std::time::{Duration, UNIX_EPOCH};
550
551    /// A test rule that emits a fixed list of FixProposals on every check call,
552    /// ignoring the parsed attributes. Lets us drive the engine deterministically
553    /// without depending on real CAPCO rule output.
554    struct StubRule {
555        id: &'static str,
556        proposals: Vec<FixProposal>,
557    }
558
559    impl Rule for StubRule {
560        fn id(&self) -> RuleId {
561            RuleId::new(self.id)
562        }
563        fn name(&self) -> &'static str {
564            "stub"
565        }
566        fn default_severity(&self) -> Severity {
567            Severity::Fix
568        }
569        fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
570            self.proposals
571                .iter()
572                .map(|p| {
573                    Diagnostic::new(
574                        p.rule.clone(),
575                        Severity::Fix,
576                        p.span,
577                        "stub",
578                        "TEST",
579                        Some(p.clone()),
580                    )
581                })
582                .collect()
583        }
584    }
585
586    struct StubSet(Vec<Box<dyn Rule>>);
587    impl RuleSet for StubSet {
588        fn rules(&self) -> &[Box<dyn Rule>] {
589            &self.0
590        }
591        fn schema_version(&self) -> &'static str {
592            "TEST"
593        }
594    }
595
596    fn proposal(rule: &'static str, start: usize, end: usize, replacement: &str) -> FixProposal {
597        proposal_with_confidence(rule, start, end, replacement, 1.0)
598    }
599
600    fn proposal_with_confidence(
601        rule: &'static str,
602        start: usize,
603        end: usize,
604        replacement: &str,
605        confidence: f32,
606    ) -> FixProposal {
607        FixProposal::new(
608            RuleId::new(rule),
609            FixSource::BuiltinRule,
610            Span::new(start, end),
611            "x",
612            replacement,
613            confidence,
614            None,
615        )
616    }
617
618    fn engine_with(proposals: Vec<FixProposal>) -> Engine {
619        engine_with_config(Config::default(), proposals)
620    }
621
622    fn engine_with_config(config: Config, proposals: Vec<FixProposal>) -> Engine {
623        let stub = StubRule {
624            id: "TEST",
625            proposals,
626        };
627        let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(stub)]));
628        Engine::with_clock(
629            config,
630            vec![set],
631            Box::new(FixedClock::new(
632                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
633            )),
634        )
635    }
636
637    /// A source long enough to span the test fix offsets, AND containing a
638    /// banner marking so the parser produces a candidate that triggers
639    /// the rule loop in `Engine::lint`.
640    const TEST_SRC: &[u8] = b"SECRET//NOFORN                                                ";
641
642    #[test]
643    fn fix_applies_disjoint_fixes_in_reverse_order() {
644        // Two non-overlapping fixes; FR-016 sorts by span.end DESC so the
645        // later one is applied first, preserving the earlier span's offsets.
646        let engine = engine_with(vec![
647            proposal("E001", 0, 6, "AA"),  // "SECRET" → "AA"
648            proposal("E002", 8, 14, "BB"), // "NOFORN" → "BB"
649        ]);
650        let result = engine.fix(TEST_SRC, FixMode::Apply);
651        let out = String::from_utf8(result.source).unwrap();
652        assert!(out.starts_with("AA//BB"), "got: {out:?}");
653        assert_eq!(result.applied.len(), 2);
654    }
655
656    #[test]
657    fn overlap_guard_drops_overlapping_fix() {
658        // Two fixes whose spans collide. C-1: keep one, drop the other.
659        let engine = engine_with(vec![
660            proposal("E001", 0, 6, "AA"),
661            proposal("E002", 3, 10, "BB"), // overlaps E001
662        ]);
663        let result = engine.fix(TEST_SRC, FixMode::Apply);
664        // Exactly one fix should be applied, the other should remain in
665        // `remaining_diagnostics` so callers can see it was not silently
666        // dropped.
667        assert_eq!(result.applied.len(), 1, "applied: {:?}", result.applied);
668        assert_eq!(
669            result.remaining_diagnostics.len(),
670            1,
671            "remaining: {:?}",
672            result.remaining_diagnostics
673        );
674    }
675
676    #[test]
677    fn dry_run_returns_original_source_but_records_applied() {
678        let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
679        let result = engine.fix(TEST_SRC, FixMode::DryRun);
680        assert_eq!(result.source, TEST_SRC, "dry-run must not mutate source");
681        assert_eq!(result.applied.len(), 1);
682        assert!(result.applied[0].dry_run, "dry_run flag must be set");
683    }
684
685    #[test]
686    fn fix_with_threshold_rejects_nan() {
687        let engine = engine_with(vec![]);
688        assert!(matches!(
689            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NAN)),
690            Err(InvalidThreshold(_))
691        ));
692    }
693
694    #[test]
695    fn fix_with_threshold_rejects_out_of_range() {
696        let engine = engine_with(vec![]);
697        assert!(matches!(
698            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(-0.1)),
699            Err(InvalidThreshold(_))
700        ));
701        assert!(matches!(
702            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(1.1)),
703            Err(InvalidThreshold(_))
704        ));
705    }
706
707    #[test]
708    fn fix_with_threshold_accepts_boundaries() {
709        let engine = engine_with(vec![]);
710        assert!(
711            engine
712                .fix_with_threshold(TEST_SRC, FixMode::Apply, Some(0.0))
713                .is_ok()
714        );
715        assert!(
716            engine
717                .fix_with_threshold(TEST_SRC, FixMode::Apply, Some(1.0))
718                .is_ok()
719        );
720    }
721
722    #[test]
723    fn fixed_clock_yields_deterministic_timestamps() {
724        let engine = engine_with(vec![proposal("E001", 0, 6, "AA")]);
725        let r1 = engine.fix(TEST_SRC, FixMode::Apply);
726        let r2 = engine.fix(TEST_SRC, FixMode::Apply);
727        assert_eq!(r1.applied[0].timestamp, r2.applied[0].timestamp);
728    }
729
730    // H-3: fix_with_threshold must reject non-finite overrides in all
731    // directions, not just NaN. INFINITY and NEG_INFINITY are both caught
732    // by the range check; this test pins that behavior so a future refactor
733    // that uses e.g. `is_finite` instead of `contains + is_nan` cannot
734    // silently regress.
735    #[test]
736    fn fix_with_threshold_rejects_infinity() {
737        let engine = engine_with(vec![]);
738        assert!(matches!(
739            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::INFINITY)),
740            Err(InvalidThreshold(_))
741        ));
742        assert!(matches!(
743            engine.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NEG_INFINITY)),
744            Err(InvalidThreshold(_))
745        ));
746    }
747
748    // M-4: the confidence filter at `f.confidence >= threshold` is on the
749    // hot path of Engine::fix. These two tests pin the `>=` semantics so a
750    // future refactor that flips it to `>` (or vice versa) is caught.
751    #[test]
752    fn confidence_below_default_threshold_is_excluded() {
753        // Config::default().confidence_threshold == 0.95. A fix at 0.94
754        // must not be applied.
755        let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.94)]);
756        let result = engine.fix(TEST_SRC, FixMode::Apply);
757        assert_eq!(result.applied.len(), 0);
758        // The below-threshold fix is a suggestion — it survives in
759        // remaining_diagnostics so the caller can surface it.
760        assert_eq!(result.remaining_diagnostics.len(), 1);
761    }
762
763    #[test]
764    fn confidence_at_default_threshold_is_included() {
765        // A fix at exactly 0.95 must be applied (inclusive threshold).
766        let engine = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)]);
767        let result = engine.fix(TEST_SRC, FixMode::Apply);
768        assert_eq!(result.applied.len(), 1);
769    }
770
771    // M-5: the zero-length-span filter (`!f.span.is_empty()`) in fix_inner
772    // is what masked the Phase 2 Span::new(0, 0) placeholders from the
773    // C-1 overlap guard. This test pins that guard explicitly so a future
774    // refactor that drops the filter is caught.
775    #[test]
776    fn zero_length_span_fix_is_filtered_before_sort() {
777        let engine = engine_with(vec![proposal("E001", 5, 5, "X")]);
778        let result = engine.fix(TEST_SRC, FixMode::Apply);
779        assert_eq!(result.applied.len(), 0);
780        // Source unchanged: no splice was attempted.
781        assert_eq!(result.source, TEST_SRC);
782    }
783
784    // L-4: all the other threshold tests go through fix_with_threshold
785    // (override path). This exercises the Config-supplied path explicitly
786    // so both branches of `fix_with_threshold_inner`'s threshold selection
787    // are covered.
788    #[test]
789    fn config_supplied_threshold_filters_proposals() {
790        let mut config = Config::default();
791        config.set_confidence_threshold(0.5).unwrap();
792        let engine = engine_with_config(
793            config,
794            vec![
795                proposal_with_confidence("E001", 0, 6, "AA", 0.4), // below
796                proposal_with_confidence("E002", 8, 14, "BB", 0.6), // above
797            ],
798        );
799        let result = engine.fix(TEST_SRC, FixMode::Apply);
800        // Only the 0.6 fix is applied.
801        assert_eq!(result.applied.len(), 1);
802        assert_eq!(result.applied[0].proposal.rule.as_str(), "E002");
803        // The 0.4 fix surfaces as a remaining diagnostic.
804        assert_eq!(result.remaining_diagnostics.len(), 1);
805    }
806
807    // Phase 3 Task 2: PageBreak candidates must reset the engine's
808    // PageContext accumulator. Without this, banner-validation rules on
809    // the second page would see portions from the first page, producing
810    // over-restrictive expected aggregates.
811    #[test]
812    fn lint_handles_multi_page_document_with_form_feed() {
813        let src: &[u8] = b"(SECRET//NOFORN) page 1 body.\nSECRET//NOFORN\n\x0c(CONFIDENTIAL) page 2 body.\nCONFIDENTIAL\n";
814        let engine = engine_with(vec![]);
815        let result = engine.lint(src);
816        // Stub rule with no proposals: clean lint, no panic, no parser
817        // error from the page-break candidate (which is filtered before
818        // parser.parse is called).
819        assert!(result.is_clean());
820    }
821
    // F.1: PageContext reset semantics are observable.
    //
    // ContextRecorderRule captures the live `page_context.portion_count()`
    // every time it's invoked. By running the engine over a multi-page
    // document and inspecting the captured counts at each banner candidate,
    // we prove that the engine resets PageContext at the page break instead
    // of accumulating across pages.
    #[derive(Clone)]
    struct ContextRecorderRule {
        // Shared log of `(marking type, portion count)` pairs; `check`
        // appends one entry per call. It sits behind `Arc<Mutex<..>>` so a
        // test can keep its own handle after the rule has been boxed and
        // handed to the engine.
        observations: std::sync::Arc<std::sync::Mutex<Vec<(marque_ism::MarkingType, usize)>>>,
    }
833
834    impl Rule for ContextRecorderRule {
835        fn id(&self) -> RuleId {
836            RuleId::new("RECORD")
837        }
838        fn name(&self) -> &'static str {
839            "page-context-recorder"
840        }
841        fn default_severity(&self) -> Severity {
842            Severity::Warn
843        }
844        fn check(&self, _attrs: &IsmAttributes, ctx: &RuleContext) -> Vec<Diagnostic> {
845            let count = ctx
846                .page_context
847                .as_ref()
848                .map(|pc| pc.portion_count())
849                .unwrap_or(0);
850            self.observations
851                .lock()
852                .unwrap()
853                .push((ctx.marking_type, count));
854            vec![]
855        }
856    }
857
858    struct RecorderSet(Vec<Box<dyn Rule>>);
859    impl RuleSet for RecorderSet {
860        fn rules(&self) -> &[Box<dyn Rule>] {
861            &self.0
862        }
863        fn schema_version(&self) -> &'static str {
864            "TEST"
865        }
866    }
867
868    #[test]
869    fn page_context_resets_observably_across_form_feed() {
870        use marque_ism::MarkingType;
871        let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
872        let rule = ContextRecorderRule {
873            observations: std::sync::Arc::clone(&observations),
874        };
875        let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![Box::new(rule)]));
876        let engine = Engine::with_clock(
877            Config::default(),
878            vec![set],
879            Box::new(FixedClock::new(
880                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
881            )),
882        );
883
884        // Two pages, separated by a form feed:
885        //   Page 1: one portion + one banner
886        //   Page break (\f)
887        //   Page 2: one portion + one banner
888        //
889        // The recorder fires on every candidate that reaches the rule loop.
890        // For the page-1 banner we expect to see 1 accumulated portion.
891        // For the page-2 banner we expect to see 1 accumulated portion
892        // (NOT 2) — the form feed must have reset the context.
893        let src: &[u8] = b"(SECRET//NF) p1 text\nSECRET//NOFORN\n\x0c(CONFIDENTIAL//NF) p2\nCONFIDENTIAL//NOFORN\n";
894        let _ = engine.lint(src);
895
896        let obs = observations.lock().unwrap();
897        // The recorder ran once per non-PageBreak candidate. Filter to
898        // banners and check the page_context count each banner saw.
899        let banner_counts: Vec<usize> = obs
900            .iter()
901            .filter(|(kind, _)| *kind == MarkingType::Banner)
902            .map(|(_, count)| *count)
903            .collect();
904        assert_eq!(
905            banner_counts.len(),
906            2,
907            "expected 2 banner observations, got: {obs:?}"
908        );
909        assert_eq!(
910            banner_counts[0], 1,
911            "page-1 banner should see 1 accumulated portion"
912        );
913        assert_eq!(
914            banner_counts[1], 1,
915            "page-2 banner should see 1 accumulated portion (the page-1 \
916             portion must be cleared by the form feed)"
917        );
918    }
919
920    #[test]
921    fn page_context_lint_starts_fresh_on_each_call() {
922        // Calling Engine::lint twice on the same engine must produce a
923        // fresh PageContext for the second call — no cross-call accumulation.
924        use marque_ism::MarkingType;
925        let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
926        let rule = ContextRecorderRule {
927            observations: std::sync::Arc::clone(&observations),
928        };
929        let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![Box::new(rule)]));
930        let engine = Engine::with_clock(
931            Config::default(),
932            vec![set],
933            Box::new(FixedClock::new(
934                UNIX_EPOCH + Duration::from_secs(1_700_000_000),
935            )),
936        );
937        let src: &[u8] = b"(SECRET//NF) text\nSECRET//NOFORN\n";
938        let _ = engine.lint(src);
939        let _ = engine.lint(src);
940
941        let obs = observations.lock().unwrap();
942        // Both calls should see identical observations — if the second
943        // call leaked state from the first, the page-2 banner_count would
944        // double.
945        let banner_counts: Vec<usize> = obs
946            .iter()
947            .filter(|(kind, _)| *kind == MarkingType::Banner)
948            .map(|(_, count)| *count)
949            .collect();
950        assert_eq!(
951            banner_counts.len(),
952            2,
953            "two lint calls should produce two banner observations"
954        );
955        assert_eq!(banner_counts, vec![1, 1]);
956    }
957
958    // M6: FR-016 tiebreaker — same span, different rule IDs.
959    // The sort is (span.end DESC, span.start DESC, rule_id ASC, replacement ASC).
960    // When two fixes target the exact same span, rule_id ASC breaks the tie,
961    // and C-1 drops the second (overlapping) fix.
962    #[test]
963    fn fr016_same_span_different_rule_ids_picks_lower_rule_id() {
964        // Two proposals for span 0..6 with different rule IDs.
965        // "C001" < "E001" lexicographically, so C001 is kept and E001 dropped.
966        let engine = engine_with(vec![
967            proposal("E001", 0, 6, "BB"),
968            proposal("C001", 0, 6, "AA"),
969        ]);
970        let result = engine.fix(TEST_SRC, FixMode::Apply);
971        assert_eq!(result.applied.len(), 1);
972        assert_eq!(result.applied[0].proposal.rule.as_str(), "C001");
973        assert_eq!(result.applied[0].proposal.replacement.as_ref(), "AA");
974    }
975
976    // FR-016 tiebreaker — same span, same rule ID, different replacements.
977    #[test]
978    fn fr016_same_span_same_rule_picks_lower_replacement() {
979        let engine = engine_with(vec![
980            proposal("E001", 0, 6, "ZZZ"),
981            proposal("E001", 0, 6, "AAA"),
982        ]);
983        let result = engine.fix(TEST_SRC, FixMode::Apply);
984        assert_eq!(result.applied.len(), 1);
985        assert_eq!(result.applied[0].proposal.replacement.as_ref(), "AAA");
986    }
987}
marque_engine/engine.rs

marque_engine/
engine.rs