use crate::clock::{Clock, SystemClock};
use crate::errors::{EngineConstructionError, EngineError};
use crate::options::{FixOptions, LintOptions};
use crate::output::{FixResult, LintResult};
use crate::recognizer::shift_token_spans;
use crate::scheduler::schedule_rewrites;
use aho_corasick::AhoCorasick;
use marque_capco::CapcoScheme;
use marque_capco::provenance::DecoderProvenance;
use marque_config::Config;
use marque_ism::Span;
use marque_rules::{
AppliedFix, CORRECTIONS_MAP_CITATION, Confidence, Diagnostic, EnginePromotionToken,
FixProposal, FixSource, RuleId, RuleSet, Severity,
};
use marque_scheme::ambiguity::Parsed;
use marque_scheme::recognizer::{ParseContext, Recognizer};
use marque_scheme::{MarkingScheme, RewriteId};
use std::collections::HashMap;
use std::panic::AssertUnwindSafe;
use std::sync::Arc;
use web_time::Instant;
/// Reports whether `deadline` is set and the current instant has reached it.
///
/// An absent deadline never expires.
#[inline]
fn deadline_expired(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
/// Rule identifier stamped on diagnostics and fix proposals built by
/// `build_decoder_diagnostic`.
const DECODER_RULE_ID: &str = "R001";
/// Citation string attached to every decoder diagnostic.
const DECODER_CITATION: &str = "CAPCO-2016 §A.6 p15";
/// Selects whether a fix pass returns rewritten bytes or only reports what it
/// would change.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FixMode {
/// Build and return the rewritten source buffer.
Apply,
/// Return the caller's bytes unchanged; recorded fixes carry `dry_run = true`.
DryRun,
}
/// Error carrying a confidence threshold that failed validation.
///
/// The wrapped value is the rejected threshold exactly as supplied.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct InvalidThreshold(pub f32);

impl std::fmt::Display for InvalidThreshold {
    /// Renders the rejected value together with the accepted range.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(rejected) = self;
        f.write_fmt(format_args!(
            "confidence threshold {} is outside [0.0, 1.0] or is NaN",
            rejected
        ))
    }
}

impl std::error::Error for InvalidThreshold {}
/// Lint-and-fix engine: owns the configuration, the rule sets, and the lookup
/// state prepared once at construction and reused by every `lint`/`fix` call.
pub struct Engine {
/// Configuration after construction has canonicalized the rule overrides and
/// moved the corrections map out into `corrections_arc`.
config: Config,
/// Rule sets whose rules are run against every recognized marking.
rule_sets: Vec<Box<dyn RuleSet>>,
/// Time source used to timestamp applied fixes.
clock: Box<dyn Clock>,
/// Shared corrections map handed to rules through `RuleContext`; `None` when
/// the configured map was empty.
corrections_arc: Option<Arc<HashMap<String, String>>>,
/// Prebuilt automaton over the active correction keys; `None` when there are
/// no active keys or the automaton could not be built.
corrections_ac: Option<CachedAhoCorasick>,
/// Rewrite order produced by `schedule_rewrites` at construction.
scheduled_rewrites: Box<[RewriteId]>,
/// Recognizer that turns candidate bytes into parsed markings.
recognizer: Arc<dyn Recognizer<CapcoScheme>>,
/// Optional corpus override; only present with the `corpus-override` feature.
#[cfg(feature = "corpus-override")]
corpus_override: Option<std::sync::Arc<marque_config::corpus_override::CorpusOverride>>,
}
/// Automaton over the correction keys plus the `(key, replacement)` table it
/// was built from.
struct CachedAhoCorasick {
/// Matcher whose pattern indices correspond to positions in `active`.
ac: AhoCorasick,
/// Sorted `(key, replacement)` pairs, indexed by the automaton's pattern id.
active: Vec<(Box<str>, Box<str>)>,
}
impl Engine {
pub fn new<S: MarkingScheme>(
config: Config,
rule_sets: Vec<Box<dyn RuleSet>>,
scheme: S,
) -> Result<Self, EngineConstructionError> {
Self::with_clock(config, rule_sets, scheme, Box::new(SystemClock))
}
/// Builds an engine with an explicit clock.
///
/// Construction canonicalizes the rule overrides in `config`, schedules the
/// scheme's page rewrites, moves the corrections map out of `config` into a
/// shared `Arc`, and precompiles an Aho-Corasick automaton over the active
/// correction keys.
///
/// A correction entry is inactive (and left out of the automaton) when its
/// key is empty, equals its replacement, or is exactly `"//"`. Empty keys are
/// excluded because an empty pattern matches at every byte offset and would
/// produce an empty-span diagnostic per position.
///
/// # Errors
/// Propagates errors from `canonicalize_rule_overrides` and
/// `schedule_rewrites`. A failure to build the automaton is not an error: it
/// is logged and the pre-scanner text corrections are disabled.
pub fn with_clock<S: MarkingScheme>(
    mut config: Config,
    rule_sets: Vec<Box<dyn RuleSet>>,
    scheme: S,
    clock: Box<dyn Clock>,
) -> Result<Self, EngineConstructionError> {
    canonicalize_rule_overrides(&mut config, &rule_sets)?;
    let scheduled_rewrites = schedule_rewrites(scheme.page_rewrites())?;
    let corrections_arc = if config.corrections.is_empty() {
        None
    } else {
        Some(Arc::new(std::mem::take(&mut config.corrections)))
    };
    let corrections_ac = corrections_arc.as_ref().and_then(|corrections| {
        let mut active: Vec<(Box<str>, Box<str>)> = corrections
            .iter()
            .filter(|(k, v)| !k.is_empty() && k != v && k.as_str() != "//")
            .map(|(k, v)| (k.as_str().into(), v.as_str().into()))
            .collect();
        // Sort so pattern ids are stable regardless of HashMap iteration order.
        active.sort_by(|(a, _), (b, _)| a.cmp(b));
        if active.is_empty() {
            return None;
        }
        let patterns: Vec<&str> = active.iter().map(|(k, _)| k.as_ref()).collect();
        match AhoCorasick::new(&patterns) {
            Ok(ac) => Some(CachedAhoCorasick { ac, active }),
            Err(e) => {
                tracing::warn!(
                    "failed to build AhoCorasick automaton for corrections map \
                     ({} patterns): {e}; pre-scanner text corrections disabled",
                    patterns.len()
                );
                None
            }
        }
    });
    Ok(Self {
        config,
        rule_sets,
        clock,
        corrections_arc,
        corrections_ac,
        scheduled_rewrites,
        recognizer: Arc::new(crate::decoder::StrictOrDecoderRecognizer::new()),
        #[cfg(feature = "corpus-override")]
        corpus_override: None,
    })
}
/// Returns the rewrite order computed when the engine was constructed.
pub fn scheduled_rewrites(&self) -> &[RewriteId] {
    &self.scheduled_rewrites[..]
}
/// Consumes the engine and returns it with `recognizer` installed in place of
/// the default one.
#[must_use = "with_recognizer returns a new Engine; the returned value must be bound for the override to take effect"]
pub fn with_recognizer(self, recognizer: Arc<dyn Recognizer<CapcoScheme>>) -> Self {
    let mut engine = self;
    engine.recognizer = recognizer;
    engine
}
/// Consumes the engine and returns it with `override_data` installed as the
/// corpus override.
#[cfg(feature = "corpus-override")]
#[must_use = "with_corpus_override returns a new Engine; the result must be bound to take effect — `engine.with_corpus_override(o)` alone leaves the engine without an override installed"]
pub fn with_corpus_override(
    self,
    override_data: std::sync::Arc<marque_config::corpus_override::CorpusOverride>,
) -> Self {
    let mut engine = self;
    engine.corpus_override = Some(override_data);
    engine
}
/// Reports whether a corpus override is installed.
///
/// Always `false` when the crate is built without the `corpus-override`
/// feature.
#[inline]
pub fn corpus_override_active(&self) -> bool {
    #[cfg(feature = "corpus-override")]
    let installed = self.corpus_override.is_some();
    #[cfg(not(feature = "corpus-override"))]
    let installed = false;
    installed
}
/// Lints `source` with default options (see [`Engine::lint_with_options`]).
pub fn lint(&self, source: &[u8]) -> LintResult {
    let defaults = LintOptions::default();
    self.lint_with_options(source, &defaults)
}
/// Lints `source` and returns every diagnostic found.
///
/// The pass scans for marking candidates, recognizes each one, runs every
/// enabled rule against it, then adds corrections-map (`C001`) diagnostics
/// from the prebuilt automaton. Finally, `Fix`-severity diagnostics whose fix
/// confidence is below the configured threshold are demoted to `Suggest`.
///
/// If `opts.deadline` passes, the diagnostics gathered so far are returned
/// with `truncated: true`.
pub fn lint_with_options(&self, source: &[u8], opts: &LintOptions) -> LintResult {
use marque_core::Scanner;
use marque_ism::{MarkingType, PageContext};
use marque_rules::RuleContext;
// Bail out before scanning if the deadline has already passed.
if deadline_expired(opts.deadline) {
return LintResult {
truncated: true,
..Default::default()
};
}
let candidates = Scanner::scan(source);
let candidates_total = candidates.len();
let mut candidates_processed: usize = 0;
let corrections_arc = self.corrections_arc.clone();
let mut diagnostics = Vec::new();
// Per-page state; all three are reset when a page break is seen.
let mut page_context = PageContext::new();
let mut page_context_arc: Option<Arc<PageContext>> = None;
let mut classification_floor: Option<u8> = None;
for candidate in &candidates {
if deadline_expired(opts.deadline) {
return LintResult {
diagnostics,
truncated: true,
candidates_processed,
candidates_total,
..Default::default()
};
}
candidates_processed += 1;
if candidate.kind == MarkingType::PageBreak {
page_context = PageContext::new();
page_context_arc = None;
classification_floor = None;
continue;
}
// A candidate at offset 0 (or with no readable preceding byte) counts as
// preceded by whitespace.
let preceded_by_whitespace = match candidate.span.start.checked_sub(1) {
None => true,
Some(prev_idx) => source
.get(prev_idx)
.map(|b| b.is_ascii_whitespace())
.unwrap_or(true),
};
let parse_cx = ParseContext {
strict_evidence: false,
zone: None,
position: None,
classification_floor,
as_of: None,
preceded_by_whitespace,
};
// Clamp the candidate span to the buffer and skip empty results.
let start = candidate.span.start.min(source.len());
let end = candidate.span.end.min(source.len());
if start >= end {
continue;
}
let bytes = &source[start..end];
// Only unambiguous parses are linted; anything else is skipped.
let Parsed::Unambiguous(mut marking) = self.recognizer.recognize(bytes, &parse_cx)
else {
continue;
};
// Token spans come back relative to `bytes`; shift them by `start`.
shift_token_spans(&mut marking.0, start);
let provenance = marking.1.take();
let attrs = marking.0;
// Only markings recognized without decoder provenance raise the floor.
if provenance.is_none() {
if let Some(level) = attrs
.classification
.as_ref()
.map(|c| c.effective_level() as u8)
{
classification_floor = Some(match classification_floor {
Some(prev) => prev.max(level),
None => level,
});
}
}
// Decoder-recognized markings may yield a canonical-form diagnostic.
if let Some(prov) = provenance {
let span = Span::new(start, end);
if let Some(diagnostic) = build_decoder_diagnostic(
span,
bytes,
&prov,
candidate.kind,
self.corpus_override_active(),
) {
diagnostics.push(diagnostic);
}
}
// Portions accumulate into the page context and invalidate the cached snapshot.
if candidate.kind == MarkingType::Portion {
page_context.add_portion(attrs.clone());
page_context_arc = None;
}
// Non-portion candidates see a snapshot of the portions gathered so far;
// the snapshot is reused until the next portion or page break.
let ctx_page = if candidate.kind != MarkingType::Portion && !page_context.is_empty() {
Some(
page_context_arc
.get_or_insert_with(|| Arc::new(page_context.clone()))
.clone(),
)
} else {
None
};
let ctx = RuleContext {
marking_type: candidate.kind,
zone: None,
position: None,
page_context: ctx_page,
corrections: corrections_arc.clone(),
};
for rule_set in &self.rule_sets {
for rule in rule_set.rules() {
// A parseable config override wins over the rule's default severity.
let configured_severity = self
.config
.rules
.overrides
.get(rule.id().as_str())
.and_then(|s| Severity::parse_config(s))
.unwrap_or(rule.default_severity());
if configured_severity == Severity::Off {
continue;
}
let rule_id = rule.id();
// A panicking rule is logged and contributes no diagnostics for this candidate.
let catch_result =
std::panic::catch_unwind(AssertUnwindSafe(|| rule.check(&attrs, &ctx)));
let mut diags = match catch_result {
Ok(d) => d,
Err(payload) => {
let msg = panic_payload_to_string(&payload);
tracing::warn!(
target: "marque_engine::rule_panic",
rule = rule_id.as_str(),
error = %msg,
"rule check panicked; skipping this rule for the current candidate"
);
Vec::new()
}
};
// The configured severity replaces whatever the rule emitted.
for d in &mut diags {
d.severity = configured_severity;
}
diagnostics.extend(diags);
}
}
}
// Corrections-map pass: scan the whole buffer with the prebuilt automaton.
if let Some(cached) = &self.corrections_ac {
let c001_severity = self
.config
.rules
.overrides
.get("C001")
.and_then(|s| Severity::parse_config(s))
.unwrap_or(Severity::Fix);
if c001_severity != Severity::Off {
// Spans that already carry a C001 diagnostic are not reported again.
let existing_c001_spans: std::collections::HashSet<Span> = diagnostics
.iter()
.filter(|d| d.rule.as_str() == "C001")
.map(|d| d.span)
.collect();
for mat in cached.ac.find_iter(source) {
let span = Span::new(mat.start(), mat.end());
let (ref key, ref value) = cached.active[mat.pattern().as_usize()];
if !existing_c001_spans.contains(&span) {
let proposal = FixProposal::new(
RuleId::new("C001"),
FixSource::CorrectionsMap,
span,
key.as_ref(),
value.as_ref(),
marque_rules::Confidence::strict(1.0),
None,
);
diagnostics.push(Diagnostic::new(
RuleId::new("C001"),
c001_severity,
span,
format!("corrections map: {key:?} → {value:?}"),
CORRECTIONS_MAP_CITATION,
Some(proposal),
));
}
}
}
}
// Demote Fix-severity diagnostics whose attached fix falls below the
// configured confidence threshold; diagnostics without a fix are left alone.
let threshold = self.config.confidence_threshold();
for d in &mut diagnostics {
if d.severity != Severity::Fix {
continue;
}
let Some(fix) = d.fix.as_ref() else { continue };
if fix.confidence.combined() < threshold {
d.severity = Severity::Suggest;
}
}
LintResult {
diagnostics,
truncated: false,
candidates_processed,
candidates_total,
..Default::default()
}
}
/// Fixes `source` with default options.
///
/// # Panics
/// Panics if [`Engine::fix_with_options`] returns an error, which the default
/// options (no deadline, no threshold override) are expected never to cause.
pub fn fix(&self, source: &[u8], mode: FixMode) -> FixResult {
    let defaults = FixOptions::default();
    let outcome = self.fix_with_options(source, mode, &defaults);
    outcome.expect(
        "fix() default options cannot fail: no deadline + pre-validated config threshold",
    )
}
/// Fixes `source`, optionally overriding the configured confidence threshold.
///
/// # Errors
/// Returns [`InvalidThreshold`] when the override is rejected by
/// [`Engine::fix_with_options`].
pub fn fix_with_threshold(
    &self,
    source: &[u8],
    mode: FixMode,
    threshold_override: Option<f32>,
) -> Result<FixResult, InvalidThreshold> {
    let opts = FixOptions {
        threshold_override,
        ..Default::default()
    };
    self.fix_with_options(source, mode, &opts)
        .map_err(|err| match err {
            EngineError::InvalidThreshold(invalid) => invalid,
            // No deadline is ever set on `opts` above.
            EngineError::DeadlineExceeded { .. } => {
                unreachable!("fix_with_threshold cannot set a deadline through its signature")
            }
        })
}
/// Fixes `source` using the threshold override and deadline in `opts`.
///
/// Without an override, the configured confidence threshold is used.
///
/// # Errors
/// Returns [`EngineError::InvalidThreshold`] when the override is NaN or
/// outside `[0.0, 1.0]`, and whatever `fix_inner` reports otherwise.
pub fn fix_with_options(
    &self,
    source: &[u8],
    mode: FixMode,
    opts: &FixOptions,
) -> Result<FixResult, EngineError> {
    let threshold = if let Some(requested) = opts.threshold_override {
        let in_range = (0.0..=1.0).contains(&requested);
        if requested.is_nan() || !in_range {
            return Err(EngineError::InvalidThreshold(InvalidThreshold(requested)));
        }
        requested
    } else {
        self.config.confidence_threshold()
    };
    self.fix_inner(source, mode, threshold, opts.deadline)
}
/// Runs the two-pass fix pipeline and assembles the [`FixResult`].
///
/// Steps:
/// 1. Lint `source`.
/// 2. Apply corrections-map text fixes via `apply_text_corrections`; if any
///    were applied, lint the corrected buffer again.
/// 3. Select fixes that are not `Suggest`, meet `threshold`, and have a
///    non-empty span that lies inside the buffer.
/// 4. Order them back-to-front and drop any fix overlapping one already kept.
/// 5. In `Apply` mode splice the replacements into a new buffer; in `DryRun`
///    mode return the caller's bytes unchanged. Either way every kept fix is
///    recorded as an [`AppliedFix`].
///
/// Diagnostics whose fix `(rule, span)` was applied are removed from
/// `remaining_diagnostics`.
///
/// # Errors
/// Returns [`EngineError::DeadlineExceeded`] with the most recent lint result
/// whenever `deadline` is observed to have passed.
fn fix_inner(
    &self,
    source: &[u8],
    mode: FixMode,
    threshold: f32,
    deadline: Option<Instant>,
) -> Result<FixResult, EngineError> {
    use std::collections::HashSet;
    let lint_opts = LintOptions {
        deadline,
        ..Default::default()
    };
    let lint1 = self.lint_with_options(source, &lint_opts);
    if deadline_expired(deadline) {
        return Err(EngineError::DeadlineExceeded {
            partial_lint: lint1,
        });
    }
    let (effective_source, pass1_applied) =
        self.apply_text_corrections(source, &lint1, threshold, mode);
    // Spans from the first lint are stale once the text has been corrected.
    let lint = if !pass1_applied.is_empty() {
        self.lint_with_options(&effective_source, &lint_opts)
    } else {
        lint1
    };
    if deadline_expired(deadline) {
        return Err(EngineError::DeadlineExceeded { partial_lint: lint });
    }
    let source_len = effective_source.len();
    let mut fixes: Vec<_> = lint
        .diagnostics
        .iter()
        .filter(|d| d.severity != Severity::Suggest)
        .filter_map(|d| d.fix.as_ref())
        .filter(|f| f.confidence.combined() >= threshold)
        .filter(|f| !f.span.is_empty())
        // A rule may hand back an inverted or out-of-range span; slicing with
        // it would panic, so such a fix is skipped and its diagnostic stays
        // in `remaining_diagnostics`.
        .filter(|f| f.span.start <= f.span.end && f.span.end <= source_len)
        .collect();
    // Back-to-front order; rule id and replacement make ties deterministic.
    fixes.sort_by(|a, b| {
        b.span
            .end
            .cmp(&a.span.end)
            .then(b.span.start.cmp(&a.span.start))
            .then(a.rule.cmp(&b.rule))
            .then(a.replacement.cmp(&b.replacement))
    });
    // Overlap guard: a fix is kept only if it ends at or before the start of
    // the previously kept fix.
    let mut kept_fixes: Vec<FixProposal> = Vec::with_capacity(fixes.len());
    let mut next_window_end: Option<usize> = None;
    for fix in &fixes {
        let fits = match next_window_end {
            Some(boundary) => fix.span.end <= boundary,
            None => true,
        };
        if fits {
            next_window_end = Some(fix.span.start);
            kept_fixes.push((*fix).clone());
        }
    }
    drop(fixes);
    let classifier_id: Option<std::sync::Arc<str>> = self
        .config
        .user
        .classifier_id
        .as_deref()
        .map(std::sync::Arc::from);
    let dry_run = mode == FixMode::DryRun;
    let now = self.clock.now();
    let mut applied_keys: HashSet<(RuleId, Span)> = HashSet::with_capacity(kept_fixes.len());
    let mut applied: Vec<AppliedFix> = Vec::with_capacity(kept_fixes.len());
    if deadline_expired(deadline) {
        return Err(EngineError::DeadlineExceeded { partial_lint: lint });
    }
    let mut deadline_aborted = false;
    let output = match mode {
        FixMode::Apply => {
            // Reserve room for the net growth caused by longer replacements.
            let extra: usize = kept_fixes
                .iter()
                .map(|f| {
                    f.replacement
                        .len()
                        .saturating_sub(f.span.end - f.span.start)
                })
                .sum();
            let mut buf = Vec::with_capacity(effective_source.len() + extra);
            let mut last_end = 0usize;
            // `kept_fixes` is back-to-front, so walk it reversed to copy
            // front-to-back.
            for fix in kept_fixes.iter().rev() {
                if deadline_expired(deadline) {
                    deadline_aborted = true;
                    break;
                }
                buf.extend_from_slice(&effective_source[last_end..fix.span.start]);
                buf.extend_from_slice(fix.replacement.as_bytes());
                last_end = fix.span.end;
            }
            if !deadline_aborted {
                buf.extend_from_slice(&effective_source[last_end..]);
            }
            buf
        }
        FixMode::DryRun => source.to_vec(),
    };
    // Record every kept fix; skipped if buffer assembly already ran out of time.
    if !deadline_aborted {
        for fix in kept_fixes {
            if deadline_expired(deadline) {
                deadline_aborted = true;
                break;
            }
            applied_keys.insert((fix.rule.clone(), fix.span));
            applied.push(AppliedFix::__engine_promote(
                fix,
                now,
                classifier_id.clone(),
                dry_run,
                None,
                engine_promotion_token(),
            ));
        }
    }
    if deadline_aborted {
        return Err(EngineError::DeadlineExceeded { partial_lint: lint });
    }
    let mut all_applied = pass1_applied;
    all_applied.extend(applied);
    let remaining_diagnostics = lint
        .diagnostics
        .into_iter()
        .filter(|d| {
            !d.fix
                .as_ref()
                .is_some_and(|f| applied_keys.contains(&(f.rule.clone(), f.span)))
        })
        .collect();
    Ok(FixResult {
        source: output,
        applied: all_applied,
        remaining_diagnostics,
    })
}
/// First fix pass: applies corrections-map (`C001`) fixes found by `lint`.
///
/// Only fixes that are not `Suggest`, come from `FixSource::CorrectionsMap`,
/// meet `threshold`, and have a non-empty span are considered. They are
/// ordered back-to-front, overlapping ones are dropped, and the survivors are
/// spliced into a copy of `source`.
///
/// Returns the corrected buffer and one [`AppliedFix`] per fix; when nothing
/// qualifies the buffer is an unchanged copy and the list is empty. The
/// buffer is rewritten in both modes; `mode` only sets the `dry_run` flag.
fn apply_text_corrections(
    &self,
    source: &[u8],
    lint: &LintResult,
    threshold: f32,
    mode: FixMode,
) -> (Vec<u8>, Vec<AppliedFix>) {
    let mut candidates: Vec<&FixProposal> = Vec::new();
    for diag in &lint.diagnostics {
        if diag.rule.as_str() != "C001" || diag.severity == Severity::Suggest {
            continue;
        }
        let Some(proposal) = diag.fix.as_ref() else {
            continue;
        };
        let eligible = proposal.source == FixSource::CorrectionsMap
            && proposal.confidence.combined() >= threshold
            && !proposal.span.is_empty();
        if eligible {
            candidates.push(proposal);
        }
    }
    if candidates.is_empty() {
        return (source.to_vec(), Vec::new());
    }
    // Back-to-front, with rule id and replacement as deterministic tie-breaks.
    candidates.sort_by(|lhs, rhs| {
        rhs.span
            .end
            .cmp(&lhs.span.end)
            .then(rhs.span.start.cmp(&lhs.span.start))
            .then(lhs.rule.cmp(&rhs.rule))
            .then(lhs.replacement.cmp(&rhs.replacement))
    });
    // Keep a fix only if it ends at or before the start of the last kept one.
    let mut selected: Vec<&FixProposal> = Vec::new();
    let mut boundary: Option<usize> = None;
    for &candidate in &candidates {
        let fits = match boundary {
            None => true,
            Some(limit) => candidate.span.end <= limit,
        };
        if fits {
            boundary = Some(candidate.span.start);
            selected.push(candidate);
        }
    }
    let classifier_id: Option<Arc<str>> =
        self.config.user.classifier_id.as_deref().map(Arc::from);
    let dry_run = matches!(mode, FixMode::DryRun);
    let now = self.clock.now();
    let mut corrected = source.to_vec();
    let mut applied = Vec::with_capacity(selected.len());
    // Splicing back-to-front keeps the earlier spans valid.
    for &chosen in &selected {
        corrected.splice(chosen.span.start..chosen.span.end, chosen.replacement.bytes());
        applied.push(AppliedFix::__engine_promote(
            chosen.clone(),
            now,
            classifier_id.clone(),
            dry_run,
            None,
            engine_promotion_token(),
        ));
    }
    (corrected, applied)
}
}
/// Creates the token that `AppliedFix::__engine_promote` takes as its last
/// argument.
#[inline]
fn engine_promotion_token() -> EnginePromotionToken {
EnginePromotionToken::__engine_construct()
}
/// Builds the `R001` diagnostic for a marking the decoder recognized.
///
/// Returns `None` when either the original bytes or the canonical bytes are
/// not valid UTF-8, or when the two are already identical.
///
/// A `DecoderClassificationHeuristic` provenance yields a `Warn` diagnostic
/// whose rule-axis confidence is `HEURISTIC_RULE_AXIS_CAP`; any other
/// provenance yields a `Fix` diagnostic with rule axis `1.0` and source
/// `DecoderPosterior`. When `corpus_override_active` is set, a zero-delta
/// `CorpusOverrideInEffect` feature is appended to the confidence features.
///
/// The proposal is built with an empty original string.
/// NOTE(review): presumably intentional — confirm against `FixProposal`.
fn build_decoder_diagnostic(
    span: Span,
    original_bytes: &[u8],
    provenance: &DecoderProvenance,
    _kind: marque_ism::MarkingType,
    corpus_override_active: bool,
) -> Option<Diagnostic> {
    use marque_rules::confidence::{FeatureContribution, FeatureId};
    let observed = std::str::from_utf8(original_bytes).ok()?;
    let replacement = std::str::from_utf8(&provenance.canonical_bytes).ok()?;
    if observed == replacement {
        return None;
    }
    let mut features: Vec<FeatureContribution> = provenance.features.to_vec();
    if corpus_override_active {
        let marker = FeatureContribution {
            id: FeatureId::CorpusOverrideInEffect,
            delta: 0.0,
        };
        features.push(marker);
    }
    let is_heuristic = matches!(
        provenance.fix_source,
        FixSource::DecoderClassificationHeuristic
    );
    let (severity, rule_axis, fix_source) = if is_heuristic {
        (
            Severity::Warn,
            HEURISTIC_RULE_AXIS_CAP,
            FixSource::DecoderClassificationHeuristic,
        )
    } else {
        (Severity::Fix, 1.0, FixSource::DecoderPosterior)
    };
    let confidence = Confidence {
        recognition: provenance.recognition_score(),
        rule: rule_axis,
        region: None,
        runner_up_ratio: provenance.runner_up_ratio,
        features,
    };
    let rule = RuleId::new(DECODER_RULE_ID);
    let proposal = FixProposal::new(
        rule.clone(),
        fix_source,
        span,
        "",
        replacement,
        confidence,
        None,
    );
    let message = format!("decoder-recognized canonical form: {replacement:?}");
    Some(Diagnostic::new(
        rule,
        severity,
        span,
        message,
        DECODER_CITATION,
        Some(proposal),
    ))
}
/// Rule-axis confidence given to decoder fixes whose source is
/// `FixSource::DecoderClassificationHeuristic`. A unit test in this file
/// requires it to equal `Config::default().confidence_threshold()`.
const HEURISTIC_RULE_AXIS_CAP: f32 = 0.95;
/// Rewrites `config.rules.overrides` so every key is a canonical rule id.
///
/// An override key may be a rule's id or its name; both resolve to the id.
/// With no overrides configured the function returns immediately.
///
/// # Errors
/// * `UnknownRuleOverride` when a key matches no known id or name; the error
///   carries the closest known key, if any is close enough.
/// * `ConflictingRuleOverride` when two keys resolve to the same rule but
///   carry different severities.
fn canonicalize_rule_overrides(
    config: &mut Config,
    rule_sets: &[Box<dyn RuleSet>],
) -> Result<(), EngineConstructionError> {
    use std::collections::hash_map::Entry;
    if config.rules.overrides.is_empty() {
        return Ok(());
    }
    // Lookup table from every accepted spelling (id or name) to the rule id.
    let mut known: HashMap<&'static str, &'static str> = HashMap::new();
    for rule in rule_sets.iter().flat_map(|set| set.rules().iter()) {
        let id_str = rule.id().as_str();
        let name = rule.name();
        known.insert(id_str, id_str);
        known.insert(name, id_str);
    }
    let configured = std::mem::take(&mut config.rules.overrides);
    // Canonical id -> (key as written, severity as written).
    let mut resolved: HashMap<&'static str, (String, String)> = HashMap::new();
    for (key, value) in configured {
        let Some(&canonical_id) = known.get(key.as_str()) else {
            let did_you_mean = suggest_closest(&key, known.keys().copied());
            return Err(EngineConstructionError::UnknownRuleOverride { key, did_you_mean });
        };
        match resolved.entry(canonical_id) {
            Entry::Occupied(slot) => {
                let (prev_key, prev_sev) = slot.get();
                // Same severity under two spellings is a harmless duplicate.
                if prev_sev != &value {
                    return Err(EngineConstructionError::ConflictingRuleOverride {
                        rule_id: canonical_id.to_owned(),
                        keys: Box::new([prev_key.clone(), key]),
                        severities: Box::new([prev_sev.clone(), value]),
                    });
                }
            }
            Entry::Vacant(slot) => {
                slot.insert((key, value));
            }
        }
    }
    config.rules.overrides = resolved
        .into_iter()
        .map(|(id, (_, severity))| (id.to_owned(), severity))
        .collect();
    Ok(())
}
/// Extracts a printable message from a caught panic payload.
///
/// `&'static str` payloads are borrowed, `String` payloads are cloned, and
/// any other payload type yields a fixed placeholder.
fn panic_payload_to_string(
    payload: &Box<dyn std::any::Any + Send + 'static>,
) -> std::borrow::Cow<'static, str> {
    use std::borrow::Cow;
    if let Some(literal) = payload.downcast_ref::<&'static str>() {
        return Cow::Borrowed(*literal);
    }
    match payload.downcast_ref::<String>() {
        Some(owned) => Cow::Owned(owned.clone()),
        None => Cow::Borrowed("<unstringifiable panic payload>"),
    }
}
/// Returns the candidate closest to `needle` by edit distance, if any is
/// close enough to be a plausible typo.
///
/// The allowed distance grows with the byte length of `needle`: 1 for up to
/// 3 bytes, 2 for 4–7 bytes, 3 otherwise. Among candidates at the same
/// smallest distance the lexicographically smallest is chosen, so the result
/// does not depend on the order in which `candidates` are yielded (callers
/// pass hash-map keys, whose order is arbitrary).
fn suggest_closest<'a, I>(needle: &str, candidates: I) -> Option<String>
where
    I: Iterator<Item = &'a str>,
{
    let max_distance = match needle.len() {
        0..=3 => 1,
        4..=7 => 2,
        _ => 3,
    };
    candidates
        .map(|cand| (levenshtein(needle, cand), cand))
        .filter(|(dist, _)| *dist <= max_distance)
        // Tuple ordering: smallest distance first, then smallest string.
        .min()
        .map(|(_, cand)| cand.to_owned())
}
/// Computes the Levenshtein edit distance between `a` and `b`, comparing
/// them byte by byte.
///
/// Uses a single rolling row of the dynamic-programming table.
fn levenshtein(a: &str, b: &str) -> usize {
    let (lhs, rhs) = (a.as_bytes(), b.as_bytes());
    if lhs.is_empty() {
        return rhs.len();
    }
    if rhs.is_empty() {
        return lhs.len();
    }
    // row[j] holds the distance between the processed prefix of `lhs` and rhs[..j].
    let mut row: Vec<usize> = (0..=rhs.len()).collect();
    for (i, &x) in lhs.iter().enumerate() {
        // `diagonal` is the previous row's value one column to the left.
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, &y) in rhs.iter().enumerate() {
            let above = row[j + 1];
            let substitution = diagonal + usize::from(x != y);
            row[j + 1] = (above + 1).min(row[j] + 1).min(substitution);
            diagonal = above;
        }
    }
    row[rhs.len()]
}
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
use super::*;
use crate::clock::FixedClock;
use marque_ism::IsmAttributes;
use marque_rules::{
Diagnostic, FixProposal, FixSource, Rule, RuleContext, RuleId, RuleSet, Severity,
};
use std::time::{Duration, UNIX_EPOCH};
/// Guards the invariant that the heuristic rule-axis cap equals the default
/// confidence threshold.
#[test]
fn heuristic_rule_axis_cap_matches_default_threshold() {
    let default_threshold = Config::default().confidence_threshold();
    let gap = (HEURISTIC_RULE_AXIS_CAP - default_threshold).abs();
    assert!(
        gap < 1e-6,
        "HEURISTIC_RULE_AXIS_CAP={HEURISTIC_RULE_AXIS_CAP} must equal \
         Config::default().confidence_threshold()={default_threshold}; \
         a divergence requires an intentional governance change recorded \
         in the cap's doc comment"
    );
}
/// Test rule that emits one `Fix` diagnostic per canned proposal, regardless
/// of the marking it is asked to check.
struct StubRule {
    id: &'static str,
    proposals: Vec<FixProposal>,
}

impl Rule for StubRule {
    fn id(&self) -> RuleId {
        RuleId::new(self.id)
    }

    fn name(&self) -> &'static str {
        "stub"
    }

    fn default_severity(&self) -> Severity {
        Severity::Fix
    }

    fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
        let mut emitted = Vec::with_capacity(self.proposals.len());
        for canned in &self.proposals {
            emitted.push(Diagnostic::new(
                canned.rule.clone(),
                Severity::Fix,
                canned.span,
                "stub",
                "TEST",
                Some(canned.clone()),
            ));
        }
        emitted
    }
}
/// Minimal rule set wrapping an owned list of rules.
struct StubSet(Vec<Box<dyn Rule>>);

impl RuleSet for StubSet {
    fn rules(&self) -> &[Box<dyn Rule>] {
        self.0.as_slice()
    }

    fn schema_version(&self) -> &'static str {
        "TEST"
    }
}
/// Builds a full-confidence (`1.0`) proposal over `start..end`.
fn proposal(rule: &'static str, start: usize, end: usize, replacement: &str) -> FixProposal {
    let full_confidence = 1.0;
    proposal_with_confidence(rule, start, end, replacement, full_confidence)
}
/// Builds a `BuiltinRule` proposal over `start..end` with the given strict
/// confidence and a placeholder original of `"x"`.
fn proposal_with_confidence(
    rule: &'static str,
    start: usize,
    end: usize,
    replacement: &str,
    confidence: f32,
) -> FixProposal {
    let span = Span::new(start, end);
    let score = marque_rules::Confidence::strict(confidence);
    FixProposal::new(
        RuleId::new(rule),
        FixSource::BuiltinRule,
        span,
        "x",
        replacement,
        score,
        None,
    )
}
/// Builds a default-configured engine whose single stub rule emits `proposals`.
fn engine_with(proposals: Vec<FixProposal>) -> Engine {
    let config = Config::default();
    engine_with_config(config, proposals)
}
fn engine_with_config(config: Config, proposals: Vec<FixProposal>) -> Engine {
let stub = StubRule {
id: "TEST",
proposals,
};
let set: Box<dyn RuleSet> = Box::new(StubSet(vec![Box::new(stub)]));
Engine::with_clock(
config,
vec![set],
marque_capco::scheme::CapcoScheme::new(),
Box::new(FixedClock::new(
UNIX_EPOCH + Duration::from_secs(1_700_000_000),
)),
)
.expect("default CAPCO scheme has no rewrite cycles")
}
/// Shared 15-byte input for the fix tests: a banner-style marking followed by
/// one trailing space.
const TEST_SRC: &[u8] = b"SECRET//NOFORN ";
/// Two non-overlapping fixes must both land in the output.
#[test]
fn fix_applies_disjoint_fixes_in_reverse_order() {
    let canned = vec![proposal("E001", 0, 6, "AA"), proposal("E002", 8, 14, "BB")];
    let outcome = engine_with(canned).fix(TEST_SRC, FixMode::Apply);
    let out = String::from_utf8(outcome.source).unwrap();
    assert!(out.starts_with("AA//BB"), "got: {out:?}");
    assert_eq!(outcome.applied.len(), 2);
}
/// Of two overlapping fixes only one is applied; the other stays as a
/// remaining diagnostic.
#[test]
fn overlap_guard_drops_overlapping_fix() {
    let canned = vec![proposal("E001", 0, 6, "AA"), proposal("E002", 3, 10, "BB")];
    let outcome = engine_with(canned).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 1, "applied: {:?}", outcome.applied);
    assert_eq!(
        outcome.remaining_diagnostics.len(),
        1,
        "remaining: {:?}",
        outcome.remaining_diagnostics
    );
}
/// Dry-run leaves the bytes untouched yet still records the fix, flagged as
/// a dry run.
#[test]
fn dry_run_returns_original_source_but_records_applied() {
    let outcome = engine_with(vec![proposal("E001", 0, 6, "AA")]).fix(TEST_SRC, FixMode::DryRun);
    assert_eq!(outcome.source, TEST_SRC, "dry-run must not mutate source");
    assert_eq!(outcome.applied.len(), 1);
    let recorded = &outcome.applied[0];
    assert!(recorded.dry_run, "dry_run flag must be set");
}
/// A NaN threshold override is rejected.
#[test]
fn fix_with_threshold_rejects_nan() {
    let eng = engine_with(vec![]);
    let outcome = eng.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(f32::NAN));
    assert!(matches!(outcome, Err(InvalidThreshold(_))));
}
/// Overrides just below 0 and just above 1 are rejected.
#[test]
fn fix_with_threshold_rejects_out_of_range() {
    let eng = engine_with(vec![]);
    for bad in [-0.1_f32, 1.1] {
        let outcome = eng.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(bad));
        assert!(matches!(outcome, Err(InvalidThreshold(_))));
    }
}
/// The inclusive bounds 0.0 and 1.0 are valid overrides.
#[test]
fn fix_with_threshold_accepts_boundaries() {
    let eng = engine_with(vec![]);
    for bound in [0.0_f32, 1.0] {
        let outcome = eng.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(bound));
        assert!(outcome.is_ok());
    }
}
/// With a fixed clock, two runs stamp their fixes identically.
#[test]
fn fixed_clock_yields_deterministic_timestamps() {
    let eng = engine_with(vec![proposal("E001", 0, 6, "AA")]);
    let first = eng.fix(TEST_SRC, FixMode::Apply);
    let second = eng.fix(TEST_SRC, FixMode::Apply);
    assert_eq!(first.applied[0].timestamp, second.applied[0].timestamp);
}
/// Positive and negative infinity are rejected as overrides.
#[test]
fn fix_with_threshold_rejects_infinity() {
    let eng = engine_with(vec![]);
    for bad in [f32::INFINITY, f32::NEG_INFINITY] {
        let outcome = eng.fix_with_threshold(TEST_SRC, FixMode::Apply, Some(bad));
        assert!(matches!(outcome, Err(InvalidThreshold(_))));
    }
}
/// A fix just under the default threshold is not applied and its diagnostic
/// remains.
#[test]
fn confidence_below_default_threshold_is_excluded() {
    let canned = vec![proposal_with_confidence("E001", 0, 6, "AA", 0.94)];
    let outcome = engine_with(canned).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 0);
    assert_eq!(outcome.remaining_diagnostics.len(), 1);
}
/// Lint demotes a low-confidence `Fix` diagnostic to `Suggest` while keeping
/// its fix attached, and `fix` then applies nothing.
#[test]
fn lint_rewrites_below_threshold_fix_severity_to_suggest() {
    let eng = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.5)]);
    let report = eng.lint(TEST_SRC);
    assert_eq!(report.diagnostics.len(), 1);
    let only = &report.diagnostics[0];
    assert_eq!(only.severity, Severity::Suggest);
    assert!(
        only.fix.is_some(),
        "the candidate fix must stay attached so the renderer can surface it"
    );
    assert_eq!(report.suggest_count(), 1);
    let fixed = eng.fix(TEST_SRC, FixMode::Apply);
    assert_eq!(fixed.applied.len(), 0);
}
/// A fix exactly at the threshold keeps `Fix` severity.
#[test]
fn lint_does_not_rewrite_at_threshold_boundary() {
    let eng = engine_with(vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)]);
    let report = eng.lint(TEST_SRC);
    assert_eq!(report.diagnostics.len(), 1);
    assert_eq!(report.diagnostics[0].severity, Severity::Fix);
}
/// The threshold post-pass must not touch a `Fix` diagnostic that has no fix
/// attached.
#[test]
fn lint_post_pass_leaves_fix_severity_with_no_fix_payload_alone() {
    struct FixWithoutProposalRule;

    impl Rule for FixWithoutProposalRule {
        fn id(&self) -> RuleId {
            RuleId::new("E997")
        }

        fn name(&self) -> &'static str {
            "stub-fix-no-proposal"
        }

        fn default_severity(&self) -> Severity {
            Severity::Fix
        }

        fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
            let bare = Diagnostic::new(
                RuleId::new("E997"),
                Severity::Fix,
                Span::new(0, 6),
                "fix-severity diagnostic with no proposal",
                "TEST",
                None,
            );
            vec![bare]
        }
    }

    let rule: Box<dyn Rule> = Box::new(FixWithoutProposalRule);
    let set: Box<dyn RuleSet> = Box::new(StubSet(vec![rule]));
    let frozen = FixedClock::new(UNIX_EPOCH + Duration::from_secs(1_700_000_000));
    let eng = Engine::with_clock(
        Config::default(),
        vec![set],
        marque_capco::scheme::CapcoScheme::new(),
        Box::new(frozen),
    )
    .expect("default CAPCO scheme has no rewrite cycles");
    let report = eng.lint(TEST_SRC);
    assert_eq!(report.diagnostics.len(), 1);
    let only = &report.diagnostics[0];
    assert_eq!(
        only.severity,
        Severity::Fix,
        "Fix-severity diagnostic with no fix payload must NOT be rewritten to Suggest",
    );
    assert!(only.fix.is_none());
}
/// A rule that is `Suggest` by default never auto-applies, even with full
/// confidence.
#[test]
fn fix_excludes_explicit_suggest_severity_from_auto_apply() {
    struct SuggestRule;

    impl Rule for SuggestRule {
        fn id(&self) -> RuleId {
            RuleId::new("S999")
        }

        fn name(&self) -> &'static str {
            "stub-suggest"
        }

        fn default_severity(&self) -> Severity {
            Severity::Suggest
        }

        fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
            let upgrade = FixProposal::new(
                RuleId::new("S999"),
                FixSource::BuiltinRule,
                Span::new(0, 6),
                "SECRET",
                "TOP SECRET",
                marque_rules::Confidence::strict(1.0),
                None,
            );
            let hint = Diagnostic::new(
                RuleId::new("S999"),
                Severity::Suggest,
                Span::new(0, 6),
                "explicit suggest with high confidence",
                "TEST",
                Some(upgrade),
            );
            vec![hint]
        }
    }

    let rule: Box<dyn Rule> = Box::new(SuggestRule);
    let set: Box<dyn RuleSet> = Box::new(StubSet(vec![rule]));
    let frozen = FixedClock::new(UNIX_EPOCH + Duration::from_secs(1_700_000_000));
    let eng = Engine::with_clock(
        Config::default(),
        vec![set],
        marque_capco::scheme::CapcoScheme::new(),
        Box::new(frozen),
    )
    .expect("default CAPCO scheme has no rewrite cycles");
    let report = eng.lint(TEST_SRC);
    assert_eq!(report.diagnostics.len(), 1);
    assert_eq!(report.diagnostics[0].severity, Severity::Suggest);
    let fixed = eng.fix(TEST_SRC, FixMode::Apply);
    assert_eq!(
        fixed.applied.len(),
        0,
        "explicit Suggest-severity fix must not auto-apply regardless of confidence"
    );
}
/// A fix exactly at the default threshold is applied.
#[test]
fn confidence_at_default_threshold_is_included() {
    let canned = vec![proposal_with_confidence("E001", 0, 6, "AA", 0.95)];
    let outcome = engine_with(canned).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 1);
}
/// A fix with an empty span is ignored and the source comes back unchanged.
#[test]
fn zero_length_span_fix_is_filtered_before_sort() {
    let outcome = engine_with(vec![proposal("E001", 5, 5, "X")]).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 0);
    assert_eq!(outcome.source, TEST_SRC);
}
/// With the configured threshold at 0.5, only the 0.6-confidence fix applies.
#[test]
fn config_supplied_threshold_filters_proposals() {
    let mut config = Config::default();
    config.set_confidence_threshold(0.5).unwrap();
    let canned = vec![
        proposal_with_confidence("E001", 0, 6, "AA", 0.4),
        proposal_with_confidence("E002", 8, 14, "BB", 0.6),
    ];
    let outcome = engine_with_config(config, canned).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 1);
    assert_eq!(outcome.applied[0].proposal.rule.as_str(), "E002");
    assert_eq!(outcome.remaining_diagnostics.len(), 1);
}
/// A two-page document separated by a form feed lints clean when no rules
/// emit anything.
#[test]
fn lint_handles_multi_page_document_with_form_feed() {
    let src: &[u8] = b"(SECRET//NOFORN) page 1 body.\nSECRET//NOFORN\n\x0c(CONFIDENTIAL) page 2 body.\nCONFIDENTIAL\n";
    let report = engine_with(vec![]).lint(src);
    assert!(report.is_clean());
}
/// Test rule that records, for every check, the marking type and how many
/// portions the supplied page context holds (0 when there is none).
#[derive(Clone)]
struct ContextRecorderRule {
    observations: std::sync::Arc<std::sync::Mutex<Vec<(marque_ism::MarkingType, usize)>>>,
}

impl Rule for ContextRecorderRule {
    fn id(&self) -> RuleId {
        RuleId::new("RECORD")
    }

    fn name(&self) -> &'static str {
        "page-context-recorder"
    }

    fn default_severity(&self) -> Severity {
        Severity::Warn
    }

    fn check(&self, _attrs: &IsmAttributes, ctx: &RuleContext) -> Vec<Diagnostic> {
        let seen_portions = match ctx.page_context.as_ref() {
            Some(page) => page.portion_count(),
            None => 0,
        };
        let mut log = self.observations.lock().unwrap();
        log.push((ctx.marking_type, seen_portions));
        Vec::new()
    }
}
/// Rule set wrapper used by the page-context recorder tests.
struct RecorderSet(Vec<Box<dyn Rule>>);

impl RuleSet for RecorderSet {
    fn rules(&self) -> &[Box<dyn Rule>] {
        self.0.as_slice()
    }

    fn schema_version(&self) -> &'static str {
        "TEST"
    }
}
/// Each page's banner must see only that page's portion: the form feed
/// clears what page 1 accumulated.
#[test]
fn page_context_resets_observably_across_form_feed() {
    use marque_ism::MarkingType;
    let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
    let recorder: Box<dyn Rule> = Box::new(ContextRecorderRule {
        observations: std::sync::Arc::clone(&observations),
    });
    let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![recorder]));
    let frozen = FixedClock::new(UNIX_EPOCH + Duration::from_secs(1_700_000_000));
    let eng = Engine::with_clock(
        Config::default(),
        vec![set],
        marque_capco::scheme::CapcoScheme::new(),
        Box::new(frozen),
    )
    .expect("default CAPCO scheme has no rewrite cycles");
    let src: &[u8] = b"(SECRET//NF) p1 text\nSECRET//NOFORN\n\x0c(CONFIDENTIAL//NF) p2\nCONFIDENTIAL//NOFORN\n";
    let _ = eng.lint(src);
    let obs = observations.lock().unwrap();
    let mut banner_counts: Vec<usize> = Vec::new();
    for (kind, count) in obs.iter() {
        if *kind == MarkingType::Banner {
            banner_counts.push(*count);
        }
    }
    assert_eq!(
        banner_counts.len(),
        2,
        "expected 2 banner observations, got: {obs:?}"
    );
    assert_eq!(
        banner_counts[0], 1,
        "page-1 banner should see 1 accumulated portion"
    );
    assert_eq!(
        banner_counts[1], 1,
        "page-2 banner should see 1 accumulated portion (the page-1 \
         portion must be cleared by the form feed)"
    );
}
/// Page context must not leak from one `lint` call into the next.
#[test]
fn page_context_lint_starts_fresh_on_each_call() {
    use marque_ism::MarkingType;
    let observations = std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
    let recorder: Box<dyn Rule> = Box::new(ContextRecorderRule {
        observations: std::sync::Arc::clone(&observations),
    });
    let set: Box<dyn RuleSet> = Box::new(RecorderSet(vec![recorder]));
    let frozen = FixedClock::new(UNIX_EPOCH + Duration::from_secs(1_700_000_000));
    let eng = Engine::with_clock(
        Config::default(),
        vec![set],
        marque_capco::scheme::CapcoScheme::new(),
        Box::new(frozen),
    )
    .expect("default CAPCO scheme has no rewrite cycles");
    let src: &[u8] = b"(SECRET//NF) text\nSECRET//NOFORN\n";
    for _ in 0..2 {
        let _ = eng.lint(src);
    }
    let obs = observations.lock().unwrap();
    let mut banner_counts: Vec<usize> = Vec::new();
    for (kind, count) in obs.iter() {
        if *kind == MarkingType::Banner {
            banner_counts.push(*count);
        }
    }
    assert_eq!(
        banner_counts.len(),
        2,
        "two lint calls should produce two banner observations"
    );
    assert_eq!(banner_counts, vec![1, 1]);
}
/// When two rules target the same span, the lower rule id wins.
#[test]
fn fr016_same_span_different_rule_ids_picks_lower_rule_id() {
    let canned = vec![proposal("E001", 0, 6, "BB"), proposal("C001", 0, 6, "AA")];
    let outcome = engine_with(canned).fix(TEST_SRC, FixMode::Apply);
    assert_eq!(outcome.applied.len(), 1);
    let winner = &outcome.applied[0].proposal;
    assert_eq!(winner.rule.as_str(), "C001");
    assert_eq!(winner.replacement.as_ref(), "AA");
}
// Two proposals from the same rule target the same span; exactly one fix must
// be applied, and it must be the one whose replacement text sorts lower
// ("AAA" before "ZZZ").
#[test]
fn fr016_same_span_same_rule_picks_lower_replacement() {
    // The higher replacement is listed first so input order cannot explain the pick.
    let competing = vec![proposal("E001", 0, 6, "ZZZ"), proposal("E001", 0, 6, "AAA")];
    let engine = engine_with(competing);

    let outcome = engine.fix(TEST_SRC, FixMode::Apply);

    assert_eq!(outcome.applied.len(), 1);
    let winner = &outcome.applied[0].proposal;
    assert_eq!(winner.replacement.as_ref(), "AAA");
}
/// Minimal test rule that carries only an identifier and a human-readable
/// name; its `Rule` implementation never emits diagnostics.
struct NamedStub {
    /// Raw rule identifier, wrapped into a `RuleId` by `Rule::id`.
    id: &'static str,
    /// Rule name returned verbatim by `Rule::name`.
    name: &'static str,
}
/// `Rule` implementation for the stub: exposes the stored id and name, reports
/// a fixed default severity, and never produces diagnostics.
impl Rule for NamedStub {
    /// Wraps the stored identifier string in a `RuleId`.
    fn id(&self) -> RuleId {
        let raw_id = self.id;
        RuleId::new(raw_id)
    }

    /// Returns the stored rule name unchanged.
    fn name(&self) -> &'static str {
        self.name
    }

    /// Always `Severity::Warn`.
    fn default_severity(&self) -> Severity {
        Severity::Warn
    }

    /// Ignores its inputs and reports nothing.
    fn check(&self, _attrs: &IsmAttributes, _ctx: &RuleContext) -> Vec<Diagnostic> {
        Vec::new()
    }
}
/// Builds a boxed rule set containing one `NamedStub` per `(id, name)` pair,
/// in the order given.
fn named_rule_set(rules: &[(&'static str, &'static str)]) -> Box<dyn RuleSet> {
    let mut stubs: Vec<Box<dyn Rule>> = Vec::with_capacity(rules.len());
    for &(id, name) in rules {
        stubs.push(Box::new(NamedStub { id, name }));
    }
    Box::new(StubSet(stubs))
}
/// Returns a default `Config` whose rule-override map has been populated with
/// the given `(key, severity)` pairs, inserted in slice order.
fn config_with_overrides(pairs: &[(&str, &str)]) -> Config {
    let mut config = Config::default();
    let overrides = &mut config.rules.overrides;
    for &(key, severity) in pairs {
        overrides.insert(key.to_owned(), severity.to_owned());
    }
    config
}
// An override already keyed by a known rule ID must pass canonicalization and
// stay under the same key with the same severity.
#[test]
fn canonicalize_accepts_rule_id_form_unchanged() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = config_with_overrides(&[("E001", "warn")]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("should succeed");

    let stored = cfg.rules.overrides.get("E001").map(String::as_str);
    assert_eq!(stored, Some("warn"), "ID-form override keeps its key");
}
// An override keyed by a rule's name must be re-keyed to that rule's ID, and
// the original name key must be gone afterwards.
#[test]
fn canonicalize_accepts_rule_name_form_and_resolves_to_id() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = config_with_overrides(&[("portion-mark-in-banner", "error")]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("should succeed");

    let overrides = &cfg.rules.overrides;
    assert_eq!(
        overrides.get("E001").map(String::as_str),
        Some("error"),
        "name-form override resolves to canonical ID"
    );
    assert!(
        overrides.get("portion-mark-in-banner").is_none(),
        "pre-canonicalization name key must not survive"
    );
}
// A key one character away from a real rule ID ("E00l" with a lowercase L
// instead of "E001") must be rejected as unknown, and the error must suggest
// the real ID.
#[test]
fn canonicalize_rejects_unknown_key_with_suggestion_for_near_miss() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = config_with_overrides(&[("E00l", "warn")]);

    let rejection = canonicalize_rule_overrides(&mut cfg, &rule_sets).unwrap_err();

    // Pull the payload out first so the assertions read flat; any other
    // variant is a test failure.
    let (key, did_you_mean) = match rejection {
        EngineConstructionError::UnknownRuleOverride { key, did_you_mean } => {
            (key, did_you_mean)
        }
        other => panic!("expected UnknownRuleOverride, got {other:?}"),
    };
    assert_eq!(key, "E00l");
    assert_eq!(
        did_you_mean.as_deref(),
        Some("E001"),
        "single-character typo should suggest the canonical ID"
    );
}
// A key that resembles no known rule ID or name must be rejected as unknown
// and must not carry a "did you mean" suggestion.
#[test]
fn canonicalize_rejects_unknown_key_without_suggestion_when_nothing_close() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = config_with_overrides(&[("totally-made-up-rule-name", "error")]);

    let rejection = canonicalize_rule_overrides(&mut cfg, &rule_sets).unwrap_err();

    // Pull the payload out first; any other variant is a test failure.
    let (key, did_you_mean) = match rejection {
        EngineConstructionError::UnknownRuleOverride { key, did_you_mean } => {
            (key, did_you_mean)
        }
        other => panic!("expected UnknownRuleOverride, got {other:?}"),
    };
    assert_eq!(key, "totally-made-up-rule-name");
    assert!(
        did_you_mean.is_none(),
        "distant misses must not emit a suggestion; got {did_you_mean:?}"
    );
}
// The same rule is overridden twice, once by ID and once by name, with
// different severities. Canonicalization must fail with a conflict error that
// names the rule and lists both offending keys and both severities.
#[test]
fn canonicalize_rejects_conflicting_id_and_name_forms_with_different_severity() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg =
        config_with_overrides(&[("E001", "warn"), ("portion-mark-in-banner", "error")]);

    let rejection = canonicalize_rule_overrides(&mut cfg, &rule_sets).unwrap_err();

    // Pull the payload out first; any other variant is a test failure.
    let (rule_id, keys, severities) = match rejection {
        EngineConstructionError::ConflictingRuleOverride {
            rule_id,
            keys,
            severities,
        } => (rule_id, keys, severities),
        other => panic!("expected ConflictingRuleOverride, got {other:?}"),
    };
    assert_eq!(rule_id, "E001");

    // Membership checks only: the order in which keys and severities are
    // reported is not part of what this test pins down.
    let has_key = |wanted: &str| keys.iter().any(|k| k.as_str() == wanted);
    assert!(has_key("E001"));
    assert!(has_key("portion-mark-in-banner"));

    let has_severity = |wanted: &str| severities.iter().any(|s| s.as_str() == wanted);
    assert!(has_severity("warn"));
    assert!(has_severity("error"));
}
// The same rule is overridden by ID and by name with the same severity. That
// is not a conflict: the two entries must collapse into a single ID-keyed one.
#[test]
fn canonicalize_accepts_duplicate_forms_with_same_severity() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg =
        config_with_overrides(&[("E001", "warn"), ("portion-mark-in-banner", "warn")]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets)
        .expect("duplicate forms with same severity must succeed");

    let overrides = &cfg.rules.overrides;
    assert_eq!(overrides.len(), 1);
    assert_eq!(overrides.get("E001").map(String::as_str), Some("warn"));
}
// Overrides may refer to rules that live in different rule sets: a name-form
// key resolved against the first set and an ID-form key matching the second
// must both be accepted.
#[test]
fn canonicalize_accepts_overrides_across_multiple_rule_sets() {
    let rule_sets = vec![
        named_rule_set(&[("E001", "portion-mark-in-banner")]),
        named_rule_set(&[("M500", "some-other-domain-rule")]),
    ];
    let mut cfg = config_with_overrides(&[
        ("portion-mark-in-banner", "error"),
        ("M500", "warn"),
    ]);

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("should succeed");

    let overrides = &cfg.rules.overrides;
    assert_eq!(overrides.get("E001").map(String::as_str), Some("error"));
    assert_eq!(overrides.get("M500").map(String::as_str), Some("warn"));
}
// With no overrides configured, canonicalization must succeed and leave the
// override map empty.
#[test]
fn canonicalize_empty_overrides_is_noop() {
    let rule_sets = vec![named_rule_set(&[("E001", "portion-mark-in-banner")])];
    let mut cfg = Config::default();

    canonicalize_rule_overrides(&mut cfg, &rule_sets).expect("empty overrides must succeed");

    assert!(cfg.rules.overrides.is_empty());
}
// An unknown-override error is a user-configuration problem and must map to
// exit code 65.
#[test]
fn unknown_rule_override_exit_code_is_dataerr() {
    let unknown = EngineConstructionError::UnknownRuleOverride {
        key: "E999".into(),
        did_you_mean: None,
    };
    let code = unknown.exit_code();
    assert_eq!(code, 65, "EX_DATAERR for user-config errors");
}
// A conflicting-override error must map to exit code 65, the same code the
// sibling test for unknown overrides labels EX_DATAERR.
#[test]
fn conflicting_rule_override_exit_code_is_dataerr() {
    let conflict = EngineConstructionError::ConflictingRuleOverride {
        rule_id: "E001".into(),
        keys: Box::new(["E001".into(), "portion-mark-in-banner".into()]),
        severities: Box::new(["warn".into(), "error".into()]),
    };
    let code = conflict.exit_code();
    assert_eq!(code, 65);
}
// A rewrite-cycle error must map to exit code 69.
// NOTE(review): 69 is presumably sysexits' EX_UNAVAILABLE, matching the test
// name — confirm against `exit_code`.
#[test]
fn rewrite_cycle_exit_code_is_unavailable() {
    let cycle = EngineConstructionError::RewriteCycle {
        axis: marque_scheme::CategoryId(0),
        members: Box::new(["a", "b"]),
    };
    let code = cycle.exit_code();
    assert_eq!(code, 69);
}
// Pins `levenshtein` against a handful of hand-checked edit distances. Each
// case keeps its own assertion so a failure points at the exact line.
#[test]
fn levenshtein_matches_reference_values() {
    use super::levenshtein as distance;

    // Identical inputs, including two empty strings.
    assert_eq!(distance("", ""), 0);
    assert_eq!(distance("E001", "E001"), 0);

    // A single substitution in the last position.
    assert_eq!(distance("E001", "E002"), 1);
    assert_eq!(distance("E001", "E00l"), 1);

    // Textbook example: two substitutions plus one insertion.
    assert_eq!(distance("kitten", "sitting"), 3);

    // Against an empty string the distance is the other string's length.
    assert_eq!(distance("", "abc"), 3);
    assert_eq!(distance("abc", ""), 3);
}
// Among several candidates, the suggestion for "E00l" must be "E001", the
// candidate that differs from it by a single character.
#[test]
fn suggest_closest_prefers_smaller_distance() {
    let cands = ["E001", "E002", "E010"];

    let suggestion = super::suggest_closest("E00l", cands.iter().copied());

    assert_eq!(suggestion.as_deref(), Some("E001"));
}
// When no candidate is close to the query, no suggestion may be returned.
#[test]
fn suggest_closest_returns_none_when_nothing_is_close_enough() {
    let cands = ["portion-mark-in-banner", "missing-usa-trigraph"];

    let suggestion = super::suggest_closest("xyz", cands.iter().copied());

    assert!(suggestion.is_none());
}
}