marque_engine/decoder.rs
1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Phase-D probabilistic [`Recognizer`] — the "decoder".
6//!
7//! This module implements the deep-scan half of the strict/deep-scan
8//! recognizer split introduced in Phase 4 PR-2. When the engine is
9//! configured for deep-scan (batch reconciliation mode,
10//! rule-escalated region, `--deep-scan` CLI flag), and the strict
11//! recognizer returns zero candidates for a marking region, the
12//! engine falls back to the decoder to recover mangled markings that
13//! are one of a small set of canonical-shape deviations away from a
14//! real CAPCO-2016 marking:
15//!
16//! - Edit-distance-1/2 token typos (`SERCET` → `SECRET`).
17//! - Token reordering within categories (`NOFORN//SECRET` →
18//! `SECRET//NOFORN`).
19//! - CAPCO-2016-superseded tokens (`COMINT` → `SI`).
20//! - Case mistakes (`secret//noforn` → `SECRET//NOFORN`).
21//! - Garbled delimiters (`S ∕∕ NOFORN` → `S//NOFORN`).
22//!
23//! The decoder never fabricates a marking where none exists. When the
24//! observed tokens fit no CAPCO grammar template, it returns
25//! `Parsed::Ambiguous { candidates: vec![] }` — the zero-candidate
26//! signal per foundational-plan line 609-612.
27//!
28//! ## Why this lives in `marque-engine`, not `marque-capco`
29//!
30//! Same Constitution VII rationale as `StrictRecognizer` (PR-2):
31//! `marque-capco` may not depend on `marque-core`, but the decoder
32//! needs core's fuzzy-vocab matcher and strict parser to materialize
33//! candidates. `marque-engine` is the sole crate where both chains
34//! converge. The original tasks.md T059/T061 placement is amended in
35//! tasks.md itself.
36//!
37//! ## Scoring approach (foundational-plan §5.2)
38//!
39//! For each candidate the decoder computes:
40//!
41//! ```text
42//! log_posterior(candidate | observed)
43//! = log_prior(candidate) // baked corpus priors (PR-1)
44//! + Σ log_likelihood(feature | candidate) // enumerated scored features
45//! ```
46//!
47//! The decoder currently scores the candidate-shape features it
48//! records from the closed [`FeatureId`] enum:
49//! `EditDistance1`, `EditDistance2`, `TokenReorder`,
50//! `SupersededToken`, and `BaseRateCommonMarking`. Each contributes
51//! a fixed log-odds delta documented at the feature's call site.
52//!
53//! [`FeatureId::StrictContextClassification`] is part of the audit-
54//! schema enum but is **not** currently a scored-feature term:
55//! classification-level context is enforced through the separate
56//! [`ParseContext::classification_floor`] hard filter (FR-011),
57//! which rejects below-floor candidates before scoring rather than
58//! adding a likelihood term to the posterior. [`FeatureId::CorpusOverrideInEffect`]
59//! is reserved for PR-5 when corpus-override is wired; the decoder
60//! does not emit it today. Turning either into an actual scored
61//! contributor requires a coordinated audit-schema bump
62//! (`MARQUE_AUDIT_SCHEMA`) per `marque-rules/src/confidence.rs` doc.
63//!
64//! The top candidate wins when its posterior exceeds the runner-up by
65//! a configured ratio; below that threshold the decoder returns
66//! `Parsed::Ambiguous { candidates }` so the engine can surface a
67//! diagnostic rather than auto-apply. `Candidate::prior_log_odds`
68//! carries the prior alone (sum of token log-priors); the
69//! per-feature log-odds deltas live only in
70//! `Candidate::evidence[i].log_odds`, so a resolver that reconstructs
71//! `prior_log_odds + Σ evidence.log_odds` recovers the decoder's
72//! internal posterior exactly, without double-counting.
73//!
74//! ## What this module is NOT
75//!
76//! - Not a full template-matching grammar engine. The MVP materializes
77//! candidates by canonicalizing observed tokens and round-tripping
78//! through the strict parser — the strict parser is the arbiter of
79//! "is this a CAPCO-shape marking." If the canonicalized bytes
80//! strict-parse, we have a candidate; if not, we discard.
81//! - Not a learning system. All priors are compile-time-baked `&'static`
82//! tables from `marque_capco::priors` (Constitution III: no runtime
83//! corpus override on WASM).
84//! - Not a fix applier. The decoder proposes `CapcoMarking` candidates;
85//! the engine applies them through the normal `Diagnostic` /
86//! `FixProposal` path with `FixSource::DecoderPosterior`.
87
88use std::collections::BTreeSet;
89
90use marque_capco::provenance::DecoderProvenance;
91use marque_capco::{CapcoMarking, CapcoScheme};
92use marque_core::{Parser, fuzzy::FuzzyVocabMatcher};
93use marque_ism::{
94 CapcoTokenSet, Classification, SciControl, SciControlBare, SciControlSystem,
95 span::{MarkingCandidate, MarkingType, Span},
96 token_set::TokenSet as _,
97};
98use marque_rules::confidence::{FeatureContribution, FeatureId};
99use marque_scheme::ambiguity::{Candidate, EvidenceFeature, Parsed};
100use marque_scheme::recognizer::{ParseContext, Recognizer};
101
102use crate::recognizer::{StrictRecognizer, is_us_restricted};
103
/// Upper bound on the number of scored candidates the decoder keeps
/// (K = 8, per foundational-plan §5.2 and research.md R3).
///
/// The primary-source corpus analysis showed diminishing returns above
/// 6: raising K costs latency without buying accuracy, while lowering
/// it loses recall on multi-token reorderings. The value is advisory
/// rather than a correctness invariant, so it can be tuned in place.
const K_MAX_CANDIDATES: usize = 8;
111
/// Minimum natural-log gap between the best and second-best posterior
/// required before the decoder commits to `Unambiguous`.
///
/// With `log_margin = top_posterior - runner_up_posterior`, a margin of
/// at least `UNAMBIGUOUS_LOG_MARGIN` collapses the result to
/// `Unambiguous(top)`. Anything smaller is reported as
/// `Ambiguous { candidates }`, so the engine can raise a diagnostic
/// instead of auto-applying a close call.
///
/// The value `1.6` is a posterior odds ratio of `e^1.6 ≈ 4.95`: given
/// the observed bytes, the top candidate is about five times as likely
/// as the runner-up. That figure is the **odds** of the top candidate
/// versus the runner-up (`P(top)/P(runner_up)`), not a probability.
const UNAMBIGUOUS_LOG_MARGIN: f32 = 1.6;
125
/// The probabilistic (Phase-D) recognizer for CAPCO markings.
///
/// The type carries no state: every prior it scores with is a baked
/// `&'static` table read at scoring time, so construction is free. The
/// engine keeps one instance behind an `Arc` for as long as a
/// [`crate::Engine`] lives.
///
/// A call made with `ParseContext::strict_evidence == true` yields a
/// zero-candidate result, deferring to the strict path. Running the
/// strict recognizer first, and reaching for the decoder only on
/// deep-scan regions, is the engine's responsibility (see the
/// `crate::Engine::lint` dispatch).
#[derive(Debug, Default, Clone, Copy)]
pub struct DecoderRecognizer;

impl DecoderRecognizer {
    /// Build a decoder recognizer. Usable in `const` contexts.
    pub const fn new() -> Self {
        DecoderRecognizer
    }
}
146
147impl Recognizer<CapcoScheme> for DecoderRecognizer {
148 fn recognize(&self, bytes: &[u8], cx: &ParseContext) -> Parsed<CapcoMarking> {
149 // Strict-path callers get zero candidates so the engine's
150 // strict recognizer remains the authoritative answer under
151 // interactive-authoring latency (SC-001). The engine only
152 // invokes the decoder when `strict_evidence = false` is
153 // explicitly requested (deep-scan mode or rule-escalated
154 // region).
155 if cx.strict_evidence {
156 return Parsed::Ambiguous {
157 candidates: Vec::new(),
158 };
159 }
160
161 let Some(kind) = infer_marking_type(bytes) else {
162 return Parsed::Ambiguous {
163 candidates: Vec::new(),
164 };
165 };
166
167 // Prose-glue suppression: a single-letter portion candidate
168 // (`(s)`, `(c)`, `(u)`, `(r)`, …) immediately glued to a
169 // preceding word — `letter(s)`, `function(c)`, `loss(s)` —
170 // is overwhelmingly a plural-suffix or function-call-shaped
171 // prose glyph, not a real CAPCO marking. The strict recognizer
172 // doesn't have the surrounding-byte context to tell these
173 // apart; the engine populates `cx.preceded_by_whitespace`
174 // from the source byte preceding the candidate's span and
175 // hands it to the decoder so this fallback path doesn't
176 // resurrect the false positive that the strict path would
177 // never have produced (the strict parser is case-sensitive
178 // and rejects lowercase tokens, so `(s)` only reaches the
179 // decoder via the case-fold canonicalization).
180 //
181 // Bullets and numbered-list markers are not a problem — they
182 // always have whitespace between the bullet and the marking
183 // (`1. (S)`, `* (S//NF)`, `(a) (S)` all set
184 // `preceded_by_whitespace = true`).
185 if !cx.preceded_by_whitespace
186 && matches!(kind, MarkingType::Portion)
187 && is_single_letter_portion(bytes)
188 {
189 return Parsed::Ambiguous {
190 candidates: Vec::new(),
191 };
192 }
193
194 // 1. Canonicalize the observed bytes into zero-or-more
195 // candidate byte-strings + per-candidate feature trace.
196 let canonical_attempts = generate_candidate_bytes(bytes);
197 if canonical_attempts.is_empty() {
198 return Parsed::Ambiguous {
199 candidates: Vec::new(),
200 };
201 }
202
203 // 2. Strict-parse each canonicalized attempt. Anything that
204 // fails strict parsing is discarded — the strict parser is
205 // the arbiter of "is this a CAPCO-shape marking." This is
206 // what guarantees the decoder never fabricates a marking
207 // shape the grammar forbids.
208 let token_set = CapcoTokenSet;
209 let parser = Parser::new(&token_set);
210 let synthetic_candidate = MarkingCandidate {
211 span: Span::new(0, 0), // re-set per attempt below
212 kind,
213 };
214 let mut scored: Vec<ScoredCandidate> = Vec::new();
215 for attempt in canonical_attempts {
216 let candidate = MarkingCandidate {
217 span: Span::new(0, attempt.bytes.len()),
218 ..synthetic_candidate
219 };
220 let Ok(mut parsed) = parser.parse(&candidate, &attempt.bytes) else {
221 continue;
222 };
223
224 // 3a. Reject partial canonicalizations. Any
225 // `TokenKind::Unknown` span surviving strict parse of
226 // the canonicalized bytes means the decoder passed an
227 // uncorrectable token through unchanged (see Case 4
228 // in `fuzzy_correct_tokens`). Accepting such a
229 // candidate would silently drop the unknown token
230 // from `token_spans` in step 3b and fabricate a
231 // partial marking — e.g., `(SECRET//WIBBLE)` would
232 // land as `classification: Some(Secret)` with
233 // WIBBLE simply discarded. The correct behavior is
234 // to discard the candidate so the decoder's output
235 // set stays honest: either a token fully resolves or
236 // the whole candidate goes away.
237 let has_unknown_token = parsed
238 .attrs
239 .token_spans
240 .iter()
241 .any(|s| matches!(s.kind, marque_ism::TokenKind::Unknown));
242 if has_unknown_token {
243 continue;
244 }
245
246 // 3b. Span-offset contract: `IsmAttributes::token_spans`
247 // returned by the strict parser carry offsets into
248 // `attempt.bytes` (the canonicalized buffer), NOT the
249 // original `bytes` slice the caller passed to
250 // `recognize()`. Propagating those spans would
251 // violate the [`Recognizer`] contract — "spans are by
252 // offset into [the input] buffer" — and misplace
253 // downstream diagnostics/fixes whenever
254 // canonicalization changed spacing, delimiter form,
255 // token order, or token length (e.g., `COMINT` → `SI`
256 // changes a 6-byte token to 2 bytes). Until we have a
257 // proper source↔canonical span map, decoder-produced
258 // markings must not carry token spans; downstream
259 // CAPCO rules that consume `attrs.token_spans` fall
260 // back to marking-level spans for decoder fixes.
261 //
262 // Clearing happens AFTER the Unknown-token check
263 // above — we need the spans to filter partial
264 // canonicalizations, but must drop them before the
265 // marking leaves the decoder.
266 parsed.attrs.token_spans = Box::new([]);
267 let marking = CapcoMarking::new(parsed.attrs);
268
269 // 3c. The strict parser is lenient — it accepts any
270 // `BYTES//BYTES` shape and emits an `IsmAttributes`
271 // with empty fields when nothing is recognized. Drop
272 // such trivial parses so the decoder doesn't
273 // fabricate a marking for prose like `FROBNITZ//WIBBLE`.
274 if !is_nontrivial_marking(&marking) {
275 continue;
276 }
277
278 // 3c-bis. Reject `Us(Restricted)` markings. Same rationale
279 // as the strict recognizer (see [`is_us_restricted`]):
280 // RESTRICTED is by definition a non-US classification,
281 // so any candidate the parser landed on the US axis
282 // is invalid regardless of what other tokens
283 // (`fgi_marker`, dissem controls, REL TO) accompany
284 // it. Real foreign-origin RESTRICTED markings parse
285 // to `Fgi(...)` / `Nato(...)` / `Joint(...)` and
286 // pass through.
287 if is_us_restricted(&marking) {
288 continue;
289 }
290
291 // 3d. FR-011 — drop candidates below the page's strict
292 // classification floor.
293 if let Some(floor) = cx.classification_floor
294 && !meets_classification_floor(&marking, floor)
295 {
296 continue;
297 }
298
299 // 3e. Portion/Banner shapes REQUIRE a classification to
300 // be a meaningful marking. The strict parser is
301 // lenient — `(YS//NF)` parses to a marking with
302 // `classification: None, dissem_controls: [Nf]`
303 // because `YS` doesn't resolve to any
304 // [`Classification`] variant. The decoder's
305 // bag-of-tokens scorer rewards FEWER negative-log-
306 // prior tokens, so without this filter the
307 // no-classification candidate would outrank a
308 // heuristic-corrected `(TS//NF)` candidate that
309 // contributed both `TOP SECRET` and `NF` priors.
310 //
311 // For CAB shapes the analogous completeness check
312 // is "any of classified_by / derived_from /
313 // declassify_on / declass_exemption is set" —
314 // [`is_nontrivial_marking`] above already covers
315 // that for the CAB code path. For
316 // [`MarkingType::PageBreak`] this filter is
317 // intentionally a no-op: page breaks are control
318 // shapes the decoder shouldn't be asked to recover.
319 if matches!(kind, MarkingType::Portion | MarkingType::Banner)
320 && marking.0.classification.is_none()
321 {
322 continue;
323 }
324
325 // 4. Score: compute prior and posterior separately. The
326 // prior is the sum of baked corpus log-priors over the
327 // marking's canonical tokens; the posterior is the
328 // prior plus the per-feature log-odds deltas recorded
329 // during canonicalization. `Candidate::prior_log_odds`
330 // is documented as the prior alone (see
331 // `crates/scheme/src/ambiguity.rs`) and is combined
332 // additively with `EvidenceFeature.log_odds` by any
333 // downstream resolver — storing the full posterior
334 // there would double-count the features once the
335 // resolver re-adds them. Internal decoder sort /
336 // threshold decisions use the posterior.
337 let (prior, posterior) = score_candidate(&attempt, &marking);
338 scored.push(ScoredCandidate {
339 marking,
340 prior,
341 posterior,
342 canonical_bytes: attempt.bytes.into_boxed_slice(),
343 features: attempt.features,
344 fix_source: attempt.fix_source,
345 });
346 }
347
348 if scored.is_empty() {
349 return Parsed::Ambiguous {
350 candidates: Vec::new(),
351 };
352 }
353
354 // 5. Drop any candidate with a non-finite posterior, sort
355 // descending, keep top K=8.
356 //
357 // NaN posteriors should be impossible —
358 // `MISSING_TOKEN_LOG_PRIOR = -12.0` and every feature delta
359 // is a finite constant — but a future scoring change could
360 // introduce a NaN-producing codepath. Under `f32::total_cmp`
361 // with the descending comparator (`b.total_cmp(&a)`), `+NaN`
362 // would sort *ahead* of every finite posterior and become the
363 // "top" candidate — its NaN posterior would then propagate
364 // into `log_margin` and `DecoderProvenance::posterior`, where
365 // `Confidence::validate` would later panic at audit-record
366 // promotion. Filter non-finite candidates out before the sort
367 // so the dispatch can never see one.
368 //
369 // `debug_assert` keeps the original assumption (decoder code
370 // does not produce NaN today) loud in dev builds; the filter
371 // is the production safeguard for if that assumption ever
372 // breaks silently.
373 debug_assert!(
374 scored.iter().all(|c| c.posterior.is_finite()),
375 "decoder produced non-finite posterior — invariant violated"
376 );
377 scored.retain(|c| c.posterior.is_finite());
378 if scored.is_empty() {
379 return Parsed::Ambiguous {
380 candidates: Vec::new(),
381 };
382 }
383 scored.sort_by(|a, b| b.posterior.total_cmp(&a.posterior));
384 scored.truncate(K_MAX_CANDIDATES);
385
386 // 6. Decision: top-over-runner-up log margin on the posterior.
387 let top_score = scored[0].posterior;
388 let runner_up_score = scored
389 .get(1)
390 .map(|c| c.posterior)
391 .unwrap_or(f32::NEG_INFINITY);
392 let log_margin = top_score - runner_up_score;
393
394 if scored.len() == 1 || log_margin >= UNAMBIGUOUS_LOG_MARGIN {
395 // Move the top candidate out so we can hand `canonical_bytes`
396 // and `features` directly to provenance without an extra
397 // clone — the marking carries the heaviest payload and we
398 // only need it once.
399 let top = scored.swap_remove(0);
400 // `runner_up_ratio = exp(log_margin)`, but a sufficiently
401 // separated top vs. runner-up overflows `f32::exp()` to
402 // `+∞` (anything past `log_margin ≈ 88.7` saturates), and
403 // `Confidence::validate` would then reject the resulting
404 // record as non-finite — making `FixProposal::new` panic at
405 // the audit boundary on extreme score separations. Saturate
406 // at `f32::MAX` so the audit record carries "the ratio is
407 // enormous" instead of crashing the engine.
408 let runner_up_ratio = if runner_up_score.is_finite() {
409 let ratio = log_margin.exp();
410 Some(if ratio.is_finite() { ratio } else { f32::MAX })
411 } else {
412 None
413 };
414 let mut marking = top.marking;
415 marking.1 = Some(DecoderProvenance::new(
416 top.canonical_bytes,
417 top.posterior,
418 runner_up_ratio,
419 top.features
420 .into_iter()
421 .map(|f| FeatureContribution {
422 id: f.id,
423 delta: f.delta,
424 })
425 .collect::<Vec<_>>()
426 .into_boxed_slice(),
427 top.fix_source,
428 ));
429 return Parsed::Unambiguous(marking);
430 }
431
432 // Ambiguous: return the whole K-truncated set with per-feature
433 // evidence so the engine can surface a user-visible diagnostic.
434 // `prior_log_odds` carries the prior alone; `evidence` carries
435 // the feature deltas. A resolver that re-computes the
436 // posterior as `prior + Σ evidence.log_odds` reproduces the
437 // decoder's internal score without double-counting.
438 Parsed::Ambiguous {
439 candidates: scored
440 .into_iter()
441 .map(|s| Candidate {
442 marking: s.marking,
443 evidence: s.features.iter().map(feature_entry_to_evidence).collect(),
444 prior_log_odds: s.prior,
445 })
446 .collect(),
447 }
448 }
449}
450
451/// One scored candidate kept in the decoder's working set.
452///
453/// `prior` and `posterior` are tracked separately so
454/// `Candidate::prior_log_odds` can carry the prior alone (per the
455/// trait-level contract in `crates/scheme/src/ambiguity.rs`) while
456/// internal sort / threshold decisions use the posterior.
457struct ScoredCandidate {
458 marking: CapcoMarking,
459 /// Sum of baked corpus log-priors over the marking's canonical
460 /// tokens. No feature deltas included.
461 prior: f32,
462 /// `prior + Σ feature.delta`. Used for sorting and threshold
463 /// comparisons inside the decoder; not stored in the emitted
464 /// `Candidate` record.
465 posterior: f32,
466 /// Canonical byte string the strict parser accepted for this
467 /// candidate. Threaded into [`DecoderProvenance::canonical_bytes`]
468 /// when this candidate wins the Unambiguous collapse, so the
469 /// engine can emit the decoder fix from the original mangled
470 /// bytes to this canonical form (Phase 4 PR-4b, T068).
471 canonical_bytes: Box<[u8]>,
472 features: Vec<FeatureEntry>,
473 /// Provenance discriminator carried from the originating
474 /// [`CanonicalAttempt`]. The engine maps this to
475 /// [`Severity::Fix`](marque_rules::Severity::Fix) for
476 /// `DecoderPosterior` and
477 /// [`Severity::Warn`](marque_rules::Severity::Warn) for
478 /// `DecoderClassificationHeuristic` (issue #133 PR 2).
479 fix_source: marque_rules::FixSource,
480}
481
482/// One feature recorded during candidate generation, paired with its
483/// log-odds contribution. The decoder accumulates these to reconstruct
484/// `Confidence::features` at audit-emit time.
485#[derive(Debug, Clone, Copy)]
486struct FeatureEntry {
487 id: FeatureId,
488 delta: f32,
489}
490
491/// Project a `FeatureEntry` onto the wire-shape [`EvidenceFeature`].
492///
493/// Routes the label through [`FeatureId::as_str`] — the single source
494/// of truth for the FeatureId → audit-record-string registry declared
495/// in `crates/rules/src/confidence.rs`. Lifted out of the inline
496/// closure in [`DecoderRecognizer::recognize`] so the projection is
497/// directly testable: a divergent local label registry (the PR #142 H2
498/// pre-fix shape) would now fail
499/// [`tests::feature_entry_to_evidence_uses_canonical_label_registry`]
500/// rather than going unnoticed because the dispatcher discards
501/// `Parsed::Ambiguous` results today.
502fn feature_entry_to_evidence(f: &FeatureEntry) -> EvidenceFeature {
503 EvidenceFeature {
504 label: f.id.as_str(),
505 log_odds: f.delta,
506 }
507}
508
509/// A canonicalization attempt: the byte string the decoder will hand
510/// to the strict parser, plus the features that transformation
511/// represents. Zero or more attempts are generated per observed input.
512struct CanonicalAttempt {
513 bytes: Vec<u8>,
514 features: Vec<FeatureEntry>,
515 /// Which decoder path produced this attempt. Defaults to
516 /// [`marque_rules::FixSource::DecoderPosterior`] for the standard
517 /// vocab-based pipeline (delimiter normalization, fuzzy
518 /// correction, token reorder, superseded-token replacement).
519 /// The position-aware classification heuristic emits attempts
520 /// with [`marque_rules::FixSource::DecoderClassificationHeuristic`]
521 /// (issue #133 PR 2) so the engine can downgrade to
522 /// [`marque_rules::Severity::Warn`] and cap
523 /// [`marque_rules::Confidence::rule`].
524 fix_source: marque_rules::FixSource,
525}
526
527// ---------------------------------------------------------------------------
528// Marking-type inference (mirrors `recognizer::infer_marking_type`)
529// ---------------------------------------------------------------------------
530
531/// Infer a [`MarkingType`] from the shape of `bytes`.
532///
533/// Same heuristic as the strict recognizer — portion on leading `(`,
534/// CAB on authority-head prefix, banner otherwise. Lives locally so
535/// the decoder doesn't need to poke into `StrictRecognizer` internals.
536fn infer_marking_type(bytes: &[u8]) -> Option<MarkingType> {
537 let first = bytes.iter().copied().find(|&b| !b.is_ascii_whitespace())?;
538 if first == b'(' {
539 return Some(MarkingType::Portion);
540 }
541 if is_cab_head(bytes) {
542 return Some(MarkingType::Cab);
543 }
544 Some(MarkingType::Banner)
545}
546
/// Report whether `bytes` opens with a CAB authority head.
///
/// Leading whitespace is skipped with `str::trim_start`; the rest must
/// begin with `Classified By:`, `Derived From:` or `Declassify On:`.
/// The comparison is case-sensitive and the colon is required. Input
/// that is not valid UTF-8 is never a CAB head.
fn is_cab_head(bytes: &[u8]) -> bool {
    const CAB_HEADS: [&str; 3] = ["Classified By:", "Derived From:", "Declassify On:"];

    std::str::from_utf8(bytes).is_ok_and(|text| {
        let body = text.trim_start();
        CAB_HEADS.iter().any(|head| body.starts_with(head))
    })
}
556
557// ---------------------------------------------------------------------------
558// Candidate byte generation
559// ---------------------------------------------------------------------------
560
561/// Generate bounded canonical-byte candidates from a mangled input.
562///
563/// Each returned [`CanonicalAttempt`] is a `Vec<u8>` the decoder will
564/// hand to the strict parser. Attempts cover the transforms named in
565/// the module docs:
566///
567/// - Case normalization (`secret//noforn` → `SECRET//NOFORN`).
568/// - Garbled-delimiter rewrite (`S ∕∕ NOFORN` → `S//NOFORN`).
569/// - Per-token fuzzy correction (edit-distance ≤ 2 via
570/// [`marque_core::fuzzy::FuzzyVocabMatcher`]).
571/// - Superseded-token replacement (`COMINT` → `SI`).
572/// - Token reordering — tried when categorical ordering is the obvious
573/// deviation (e.g., portion `A//B` where B is a classification and
574/// A isn't).
575///
576/// Bounded by [`K_MAX_CANDIDATES`] × 2 to keep the strict-parse pass
577/// bounded; duplicates (different feature traces producing the same
578/// canonical bytes) are deduplicated at emit time.
579fn generate_candidate_bytes(bytes: &[u8]) -> Vec<CanonicalAttempt> {
580 let Ok(text) = std::str::from_utf8(bytes) else {
581 return Vec::new();
582 };
583
584 // Strip surrounding whitespace; preserve leading `(` for portion
585 // detection so the strict parser's portion path stays keyed off
586 // the same first-non-whitespace byte the recognizer saw.
587 let trimmed = text.trim();
588 if trimmed.is_empty() {
589 return Vec::new();
590 }
591
592 let mut attempts: Vec<CanonicalAttempt> = Vec::new();
593 let mut emit =
594 |bytes: Vec<u8>, features: Vec<FeatureEntry>, fix_source: marque_rules::FixSource| {
595 // Hard cap at K_MAX_CANDIDATES × 2 — guarantees the strict-parse
596 // work downstream is bounded even if new transform stages are added.
597 if attempts.len() >= K_MAX_CANDIDATES * 2 {
598 return;
599 }
600 // Dedup by the canonical byte string — different transform
601 // sequences can converge on the same output. Emit-first wins:
602 // the standard vocab-based attempts are emitted before the
603 // heuristic attempt, so a heuristic candidate with bytes that
604 // converge on a vocab-based result is dropped here, preserving
605 // the more authoritative `FixSource::DecoderPosterior`
606 // provenance.
607 if !attempts.iter().any(|a| a.bytes == bytes) {
608 attempts.push(CanonicalAttempt {
609 bytes,
610 features,
611 fix_source,
612 });
613 }
614 };
615
616 // ---- Raw: just trim + normalize delimiters/case. --------------
617 let (normalized, mut delim_features) = normalize_delimiters_and_case(trimmed);
618
619 // ---- REL TO structural repair (issue #133 PR 9) — applied as
620 // PREPROCESSING (before fuzzy correction) rather than as a
621 // competing candidate emission. All four PR-9 patterns are
622 // safe to apply unconditionally:
623 //
624 // - Patterns 1/2 (`REL OT ` / `RELT O ` → `REL TO `) are
625 // literal-shape transforms. Neither pattern appears in any
626 // valid CAPCO text — REL has exactly two valid extensions
627 // (`REL TO` and `RELIDO`) — so the byte replacement is
628 // collision-free.
629 // - Patterns 3/4 (`A US` → `AUS`, `AU,S ` → `AUS, `) are
630 // trigraph-guarded inside a `REL TO ` block: the fix only
631 // fires when the joined 3-letter string is a known trigraph
632 // AND the shorter prefix alone is not, so a false positive
633 // would require the trigraph dictionary itself to disagree
634 // with reality.
635 //
636 // Applying as preprocessing avoids two scoring problems that
637 // a separate-candidate emission would hit: (a) fuzzy
638 // correction would silently rewrite `RELT` → `REL` before
639 // pattern 2's header normalize could fire, and (b) REL TO
640 // trigraphs do NOT contribute to the prior in
641 // `canonical_tokens_for` (only classification, SCI, dissem,
642 // NIC, AEA, FGI do — see issue #186 for the corpus-weighted
643 // trigraph priors followup), so a separate fix candidate
644 // would tie with the raw on prior and lose on emit-order.
645 // Preprocessing eliminates the competing-raw-candidate
646 // problem entirely.
647 //
648 // When structural repair fires, push a `BaseRateCommonMarking`
649 // feature onto `delim_features` so every candidate derived
650 // from the repaired text inherits the marker. This mirrors
651 // `try_insert_delimiter` and `try_sar_indicator_repair`
652 // (which add their own per-candidate `BaseRateCommonMarking`)
653 // and ensures the audit/provenance trace reflects that the
654 // input required cleanup beyond delimiter/case normalization.
655 // No dedicated `FeatureId` for structural repair exists in
656 // the audit schema (`marque-mvp-2`); reusing
657 // `BaseRateCommonMarking` keeps the schema closed and
658 // composes additively with the other normalization paths
659 // that share the same id.
660 let repaired_text = match try_rel_to_structural_repair(&normalized) {
661 Some(repaired) => {
662 delim_features.push(FeatureEntry {
663 id: FeatureId::BaseRateCommonMarking,
664 delta: -0.3,
665 });
666 repaired
667 }
668 None => normalized,
669 };
670
671 // ---- SCI delimiter repair (issue #198, #133 PR 10). Same
672 // preprocessing-shape as the REL TO repair above: rewrites
673 // concatenated CVE compounds (`HCSP → HCS-P`), missing
674 // slashes between bare control systems (`SITK → SI/TK`), and
675 // wrong-delimiter cases (`SI-TK → SI/TK`). All targets live
676 // in `CVEnumISMSCIControls.xml` — no agency vocab. Sub-
677 // compartments and unregistered compartments are out of
678 // scope (issue #180). Push a `BaseRateCommonMarking`
679 // penalty for the same reason as REL TO repair: a candidate
680 // that arrived clean should outrank one that needed
681 // structural cleanup when both produce the same shape.
682 let repaired_text = match try_sci_delimiter_repair(&repaired_text) {
683 Some(repaired) => {
684 delim_features.push(FeatureEntry {
685 id: FeatureId::BaseRateCommonMarking,
686 delta: -0.3,
687 });
688 repaired
689 }
690 None => repaired_text,
691 };
692
693 // ---- Per-token fuzzy correction on the repaired text. --------
694 let vocab = CapcoTokenSet.correction_vocab();
695 let matcher = FuzzyVocabMatcher::new(vocab);
696 let (fuzzy_corrected, fuzzy_features) = fuzzy_correct_tokens(&repaired_text, &matcher);
697
698 // Emit the straightforward "normalize + fuzzy-correct" attempt
699 // first — this covers typos (T046) and case/delimiter mangling
700 // by default.
701 let mut features = delim_features.clone();
702 features.extend(fuzzy_features.iter().copied());
703 emit(
704 fuzzy_corrected.clone().into_bytes(),
705 features,
706 marque_rules::FixSource::DecoderPosterior,
707 );
708
709 // ---- Also attempt a token-reorder pass. The reorder is gentle:
710 // inside each `//`-separated segment, if the segment's tokens
711 // look like they belong to multiple categories, we try a
712 // canonical category ordering (classification first).
713 if let Some(reordered) = try_canonical_reorder(&fuzzy_corrected) {
714 let mut features = delim_features.clone();
715 features.extend(fuzzy_features.iter().copied());
716 features.push(FeatureEntry {
717 id: FeatureId::TokenReorder,
718 delta: -0.4,
719 });
720 emit(
721 reordered.into_bytes(),
722 features,
723 marque_rules::FixSource::DecoderPosterior,
724 );
725 }
726
727 // ---- Non-US prefix insertion. For bare non-US markings that
728 // arrive with no `//` at all (e.g., `NS`, `JOINT S GBR USA`,
729 // `CAN S`), emit a `//{body}` candidate so the strict parser
730 // enters the non-US classification code path. The reorder pass
731 // above handles inputs that already contain `//` but are
732 // missing the leading empty-US-slot prefix.
733 if let Some(prefixed) = try_add_non_us_prefix(&fuzzy_corrected) {
734 let mut features = delim_features.clone();
735 features.extend(fuzzy_features.iter().copied());
736 features.push(FeatureEntry {
737 id: FeatureId::TokenReorder,
738 delta: -0.4,
739 });
740 emit(
741 prefixed.into_bytes(),
742 features,
743 marque_rules::FixSource::DecoderPosterior,
744 );
745 }
746
747 // ---- Missing-delimiter insertion (issue #133 PR 3). Walks the
748 // fuzzy-corrected text, inserts `//` at category-transition
749 // whitespace gaps. Tagged with `FixSource::DecoderPosterior`
750 // because the recovery is structural (missing punctuation),
751 // not a probabilistic guess like the classification heuristic
752 // below — auto-applies at default threshold when its strict
753 // parse + scoring outranks competing candidates.
754 if let Some(delim_inserted) = try_insert_delimiter(&fuzzy_corrected) {
755 let mut features = delim_features.clone();
756 features.extend(fuzzy_features.iter().copied());
757 // No FeatureId for delimiter insertion in the audit schema.
758 // Reuse `BaseRateCommonMarking` with a small negative delta
759 // to record that this attempt required cleanup beyond the
760 // raw input — keeps the canonical-arrived-clean attempt
761 // ranked higher when both produce the same shape.
762 features.push(FeatureEntry {
763 id: FeatureId::BaseRateCommonMarking,
764 delta: -0.3,
765 });
766 emit(
767 delim_inserted.into_bytes(),
768 features,
769 marque_rules::FixSource::DecoderPosterior,
770 );
771 }
772
773 // ---- SAR indicator-keyword structural repair (issue #133 PR 6).
774 // Recovers `USAR-BP-J12...` (stray prefix on the SAR
775 // indicator) and `SARBP` (missing hyphen between indicator
776 // and program identifier). Same provenance / penalty story
777 // as `try_insert_delimiter`: a `BaseRateCommonMarking` delta
778 // records that the candidate required cleanup beyond raw
779 // input, so a canonical-arrived-clean candidate beats a
780 // SAR-repaired one with the same final shape.
781 if let Some(sar_repaired) = try_sar_indicator_repair(&fuzzy_corrected) {
782 let mut features = delim_features.clone();
783 features.extend(fuzzy_features.iter().copied());
784 features.push(FeatureEntry {
785 id: FeatureId::BaseRateCommonMarking,
786 delta: -0.3,
787 });
788 emit(
789 sar_repaired.into_bytes(),
790 features,
791 marque_rules::FixSource::DecoderPosterior,
792 );
793 }
794
795 // ---- Stray-character `/X/` recovery (issue #133 PR 7). Walks
796 // the fuzzy-corrected text looking for the pattern
797 // `<alnum>/<single_alnum_char>/<alnum>` — three transforms
798 // emitted per match (drop X, attach X to right token,
799 // attach X to left token). Step 3a's Unknown-token filter
800 // acts as the natural disambiguator: only the transform
801 // that produces a recognizable token survives. See
802 // [`try_collapse_stray_char_slash`] for the recovery
803 // shapes (`SI/U/NOFORN` → drop, `SI/N/OFORN` →
804 // right-attach, `SECRE/T/REL TO` → left-attach).
805 for candidate in try_collapse_stray_char_slash(&fuzzy_corrected) {
806 let mut features = delim_features.clone();
807 features.extend(fuzzy_features.iter().copied());
808 features.push(FeatureEntry {
809 id: FeatureId::BaseRateCommonMarking,
810 delta: -0.3,
811 });
812 emit(
813 candidate.into_bytes(),
814 features,
815 marque_rules::FixSource::DecoderPosterior,
816 );
817 }
818
819 // ---- REL TO trigraph fuzzy-priors expansion (issue #233).
820 // The standard fuzzy path in `fuzzy_correct_tokens` operates
821 // against `correction_vocab()`, which deliberately excludes
822 // country trigraphs (see the comment on `ALL_CVE_TOKENS` in
823 // `crates/ism/build.rs` and the design rationale in
824 // `EXTENDED_CORRECTION_VOCAB`). Trigraphs live in a separate
825 // `TRIGRAPHS` slice reached via `is_trigraph`. So an unknown
826 // 3-char REL TO entry like `USB` doesn't get any fuzzy
827 // correction — the standard fuzzy walk has nothing to match
828 // against. The strict REL TO parser previously dropped
829 // unknown entries silently; issue #233 makes
830 // `parse_rel_to_with_spans` emit `TokenKind::Unknown` instead
831 // so the dispatcher's step 3a rejects the "drop USB"
832 // candidate.
833 //
834 // With unknown entries no longer silently absorbed, the
835 // candidate set must include real trigraph alternates for
836 // the dispatcher to choose between. This block walks each
837 // `REL TO ` block, finds 3-char entries that aren't valid
838 // trigraphs, and emits one canonical-byte alternate per
839 // candidate from a fuzzy match against the TRIGRAPHS
840 // slice. The structural strict parse +
841 // `score_candidate` (which sums `country_code_log_prior`
842 // over the parsed `rel_to` slice) then picks the right
843 // winner: USA dominates UZB by ~7 nats, far above
844 // `UNAMBIGUOUS_LOG_MARGIN`.
845 //
846 // Each alternate carries an `EditDistance1` /
847 // `EditDistance2` feature so the audit trail records the
848 // fuzzy work, plus a zero-delta `BaseRateCommonMarking`
849 // feature whose role is purely audit-trail provenance —
850 // "country-code priors were consulted on this candidate".
851 // The actual scoring weight comes from `score_candidate`
852 // summing `country_code_log_prior` over `attempt.rel_to`;
853 // adding a non-zero delta here would double-count. The
854 // other structural-cleanup paths in this file use `-0.3`
855 // because they have no parallel score-time prior to back
856 // them up; the trigraph path does, so the audit feature
857 // is informational only. No new `FeatureId` variant —
858 // adding one would bump the audit schema. Reusing
859 // `BaseRateCommonMarking` matches the variant's existing
860 // doc ("the candidate's base rate in the target corpus
861 // dominates the posterior").
862 let trigraph_matcher = FuzzyVocabMatcher::new(marque_ism::TRIGRAPHS);
863 for (alt_text, edit_feature) in
864 try_rel_to_fuzzy_trigraph_candidates(&fuzzy_corrected, &trigraph_matcher)
865 {
866 let mut features = delim_features.clone();
867 features.extend(fuzzy_features.iter().copied());
868 features.push(edit_feature);
869 // Trigraph-prior acknowledgement (see comment above for the
870 // FeatureId reuse rationale + zero-delta justification).
871 features.push(FeatureEntry {
872 id: FeatureId::BaseRateCommonMarking,
873 delta: 0.0,
874 });
875 emit(
876 alt_text.into_bytes(),
877 features,
878 marque_rules::FixSource::DecoderPosterior,
879 );
880 }
881
882 // ---- REL TO USA-injection for short first entries (issue #234 PR-B).
883 // Complementary to PR-A above: PR-A fuzzy-matches 3-char REL TO
884 // entries; PR-B handles 1-2 char first entries that are below
885 // `MIN_FUZZY_LEN`. The §H.8 p151 USA-first invariant gives us a
886 // strong structural signal that fuzzy matching cannot exploit
887 // on inputs that short — `SA → USA`, `S → USA`, etc. The
888 // `BaseRateCommonMarking` audit delta keeps the audit schema
889 // closed (no new `FeatureId` variant); see the doc on
890 // `try_rel_to_usa_injection_candidates` for the rationale.
891 for (alt_text, prior_feature) in try_rel_to_usa_injection_candidates(&fuzzy_corrected) {
892 let mut features = delim_features.clone();
893 features.extend(fuzzy_features.iter().copied());
894 features.push(prior_feature);
895 emit(
896 alt_text.into_bytes(),
897 features,
898 marque_rules::FixSource::DecoderPosterior,
899 );
900 }
901
902 // ---- Position-aware classification heuristic (issue #133 PR 2).
903 // Runs LAST so the dedup-keep-first guard above lets a
904 // vocab-based attempt with the same canonical bytes win the
905 // provenance contest — the heuristic only "wins" when no
906 // vocab path produces the same shape.
907 //
908 // Scoring intentionally adds NO `EditDistance1` penalty.
909 // The heuristic's value comes from RECOGNIZING a
910 // classification token where the vocab-only path would
911 // leave the slot as `classification: None`. The added prior
912 // contribution from the recognized classification (e.g.,
913 // `log_prior("TOP SECRET")`) is what should put the
914 // heuristic candidate ahead of the no-classification fuzzy
915 // fallback. An EditDistance penalty would push the
916 // heuristic candidate BELOW the no-classification candidate
917 // and the fuzzy one would win — defeating the heuristic's
918 // purpose. The audit-record provenance still distinguishes
919 // this path through `FixSource::DecoderClassificationHeuristic`.
920 if let Some(heuristic_bytes) = try_classification_heuristic_fix(&fuzzy_corrected) {
921 let mut features = delim_features.clone();
922 features.extend(fuzzy_features.iter().copied());
923 emit(
924 heuristic_bytes.into_bytes(),
925 features,
926 marque_rules::FixSource::DecoderClassificationHeuristic,
927 );
928 }
929
930 attempts
931}
932
/// Diagnostic hook: the canonicalized byte strings the decoder would
/// hand to the strict parser for `bytes`, one entry per attempt and in
/// the order the attempts were emitted.
///
/// Only the `bytes` field of each attempt is surfaced; feature traces
/// and the internal [`CanonicalAttempt`] type stay private because the
/// diagnostic only needs to see what the strict parser will see.
///
/// Compiled only under the `decoder-harness` feature, so it is absent
/// from production builds. Its single consumer is
/// `crates/engine/tests/decoder_diagnostic.rs` (issue #133 root-cause
/// tracing). Delegating to the real [`generate_candidate_bytes`]
/// means the diagnostic cannot drift from the decoder the way a
/// hand-rolled re-implementation would.
#[cfg(feature = "decoder-harness")]
pub fn diagnostic_canonical_attempts(bytes: &[u8]) -> Vec<Vec<u8>> {
    let mut canonical = Vec::new();
    for attempt in generate_candidate_bytes(bytes) {
        canonical.push(attempt.bytes);
    }
    canonical
}
952
953/// Normalize delimiters and case on a trimmed input.
954///
955/// - Fullwidth slash variants (`∕∕`, `/ /`, ` / / `, spaced `//`) all
956/// collapse to `//`.
957/// - ASCII alphabetic characters are upper-cased; the CAPCO grammar
958/// is case-sensitive uppercase (§B).
959/// - Leading `(` and trailing `)` are preserved so portion detection
960/// still works.
961///
962/// Returns the normalized string and the features that were applied.
963/// When normalization was actually needed, a `BaseRateCommonMarking`
964/// feature is recorded with a negative delta — the candidate pays a
965/// small penalty for having required case- or delimiter-cleanup
966/// rather than arriving in canonical form. A candidate that
967/// normalized cleanly and also resolved its tokens via fuzzy
968/// correction will still outrank a candidate that arrived dirty,
969/// but a canonical-from-the-start candidate beats both.
970fn normalize_delimiters_and_case(text: &str) -> (String, Vec<FeatureEntry>) {
971 let mut features = Vec::new();
972
973 // Collapse fullwidth and spaced slash variants.
974 // The order matters: we want multi-char sequences first.
975 let mut normalized: String = text.to_owned();
976 let replacements = [
977 ("∕∕", "//"),
978 (" // ", "//"),
979 ("// ", "//"),
980 (" //", "//"),
981 ("/ / ", "//"),
982 (" / / ", "//"),
983 ("/ /", "//"),
984 ];
985 let mut delim_changed = false;
986 for (from, to) in replacements {
987 if normalized.contains(from) {
988 normalized = normalized.replace(from, to);
989 delim_changed = true;
990 }
991 }
992
993 // Case normalization. If the input was all-lowercase or mixed-case
994 // (Title Case), uppercasing is a significant canonicalization the
995 // decoder flags (via the `BaseRateCommonMarking` feature below)
996 // so the posterior reflects that the candidate required cleanup.
997 let had_lowercase = normalized.chars().any(|c| c.is_ascii_lowercase());
998 if had_lowercase {
999 normalized = normalized.to_ascii_uppercase();
1000 }
1001
1002 if delim_changed || had_lowercase {
1003 // Record a `BaseRateCommonMarking` feature with a penalty
1004 // delta. The feature doesn't fit into one of the sharper
1005 // features (`EditDistance*`, `TokenReorder`,
1006 // `SupersededToken`), but it flags that we had to massage
1007 // the input — delimiters were non-canonical, or case was
1008 // wrong. A small negative delta means a canonical-input
1009 // candidate outranks an otherwise-equivalent normalized one,
1010 // which is the intent: "arrives clean" should be preferred
1011 // over "needed cleanup."
1012 features.push(FeatureEntry {
1013 id: FeatureId::BaseRateCommonMarking,
1014 delta: -0.3,
1015 });
1016 }
1017
1018 (normalized, features)
1019}
1020
1021/// Fuzzy-correct each whitespace/delimiter-separated token in `text`.
1022///
1023/// Tokens that are already canonical are passed through. Unknown
1024/// tokens are run through [`FuzzyVocabMatcher`]; if a correction is
1025/// unambiguous the replacement lands in the output and the appropriate
1026/// `EditDistance1`/`EditDistance2` feature is recorded. If no
1027/// correction is available, the token is dropped into the output
1028/// unchanged.
1029///
1030/// Note on pass-through safety: `marque_core::Parser` is lenient — it
1031/// does NOT reject the whole parse when an unknown token appears, it
1032/// emits the token as a `TokenKind::Unknown` span instead. So
1033/// dropping an uncorrectable token through this step does not by
1034/// itself reject the candidate. The decoder's outer loop
1035/// (`DecoderRecognizer::recognize` step 3a) checks for any Unknown
1036/// span on the strict-parse result and discards such candidates
1037/// before they reach scoring — that is where partial-canonicalization
1038/// candidates get filtered out.
1039///
1040/// Also consults [`SUPERSEDED_TOKEN_MAP`] for CAPCO-2016 retirement
1041/// pairs (currently just `COMINT` → `SI`), recording the
1042/// `SupersededToken` feature when triggered.
1043fn fuzzy_correct_tokens(
1044 text: &str,
1045 matcher: &FuzzyVocabMatcher<'_>,
1046) -> (String, Vec<FeatureEntry>) {
1047 let mut features = Vec::new();
1048 let mut out = String::with_capacity(text.len());
1049 let mut rest = text;
1050
1051 // We walk the text segment-by-segment, preserving the `//`,
1052 // `-`, `(`, `)`, `,`, and whitespace delimiters verbatim. Tokens
1053 // are the maximal runs of ASCII alphanumerics (plus `-` when it
1054 // appears between alphanumerics, to keep compounds like `SI-G`
1055 // intact).
1056 while !rest.is_empty() {
1057 // Take the non-token prefix (delimiters/whitespace/punct).
1058 let non_token_len = rest
1059 .chars()
1060 .take_while(|c| !is_token_char(*c))
1061 .map(|c| c.len_utf8())
1062 .sum::<usize>();
1063 if non_token_len > 0 {
1064 out.push_str(&rest[..non_token_len]);
1065 rest = &rest[non_token_len..];
1066 continue;
1067 }
1068 // Take the token: alnum + internal `-`.
1069 let token_len = scan_token(rest);
1070 if token_len == 0 {
1071 // Should not happen given the non-token prefix branch,
1072 // but guard against infinite loops on pathological input.
1073 break;
1074 }
1075 let (token, tail) = rest.split_at(token_len);
1076 rest = tail;
1077
1078 // Case 1: exact superseded token (e.g., standalone `COMINT` → `SI`).
1079 if let Some(replacement) = SUPERSEDED_TOKEN_MAP
1080 .iter()
1081 .find(|&&(from, _)| from == token)
1082 .map(|&(_, to)| to)
1083 {
1084 out.push_str(replacement);
1085 features.push(FeatureEntry {
1086 id: FeatureId::SupersededToken,
1087 delta: -0.2,
1088 });
1089 continue;
1090 }
1091
1092 // Case 1b: embedded superseded token — the deprecated keyword
1093 // appears as a substring within a longer token. Handles compound
1094 // prefixes (`COMINT-G` → `SI-G`), embedded substitutions
1095 // (`UNCLASCOMINTFIED` → `UNCLASSIFIED`, `FRD-COMINTGMA 14` →
1096 // `FRD-SIGMA 14`, `SENCOMINTTIVE` → `SENSITIVE`). The token !=
1097 // from guard ensures the exact-match case above is the only path
1098 // for bare superseded tokens. CAPCO-2016 §H.4 p74.
1099 let embedded_replacement = SUPERSEDED_TOKEN_MAP
1100 .iter()
1101 .find(|&&(from, _)| token != from && token.contains(from))
1102 .map(|&(from, to)| token.replace(from, to));
1103 if let Some(replaced) = embedded_replacement {
1104 out.push_str(&replaced);
1105 features.push(FeatureEntry {
1106 id: FeatureId::SupersededToken,
1107 delta: -0.2,
1108 });
1109 continue;
1110 }
1111
1112 // Case 2: already canonical (known CVE token or trigraph).
1113 // Check this first so we don't run a vocab scan + edit-
1114 // distance pass on tokens we already recognize.
1115 if CapcoTokenSet.canonicalize(token).is_some() || CapcoTokenSet.is_trigraph(token) {
1116 out.push_str(token);
1117 continue;
1118 }
1119
1120 // Case 3: fuzzy-correctable. Compute once and reuse; the
1121 // previous structure called `matcher.correct(token)` twice
1122 // on tokens that weren't already canonical, doubling the
1123 // vocab-scan cost on exactly the unknown-token hot path.
1124 if let Some(correction) = matcher.correct(token) {
1125 out.push_str(correction.token);
1126 // `FeatureId` is part of the audit-schema contract (see
1127 // `crates/rules/src/confidence.rs` and the
1128 // `MARQUE_AUDIT_SCHEMA` pin); a wildcard `_` arm on it
1129 // would silently absorb future-variant additions. Pair
1130 // each (id, delta) directly off `correction.distance` so
1131 // both arms are total over the only two outcomes the
1132 // outer guard permits (`distance > 0`, `distance <=
1133 // MAX_EDIT_DISTANCE = 2`).
1134 let feature = match correction.distance {
1135 // `correct` returns `None` for exact matches, so
1136 // `distance == 0` cannot reach here; `MAX_EDIT_DISTANCE
1137 // == 2` upstream caps `distance <= 2`.
1138 0 => None,
1139 1 => Some(FeatureEntry {
1140 id: FeatureId::EditDistance1,
1141 delta: -0.5,
1142 }),
1143 _ => Some(FeatureEntry {
1144 id: FeatureId::EditDistance2,
1145 delta: -1.2,
1146 }),
1147 };
1148 if let Some(entry) = feature {
1149 features.push(entry);
1150 }
1151 continue;
1152 }
1153
1154 // Case 4: unknown and uncorrectable. Pass through verbatim.
1155 // The strict parser will register this as a
1156 // `TokenKind::Unknown` span rather than failing the parse
1157 // outright, so the decoder's outer loop (step 3a of
1158 // `DecoderRecognizer::recognize`) is what filters the
1159 // resulting partial-canonicalization candidate out.
1160 out.push_str(token);
1161 }
1162
1163 (out, features)
1164}
1165
/// Whether `c` can start or continue a token: ASCII letters and
/// digits only. The internal `-` of compounds is not handled here;
/// [`scan_token`] admits it as a separator between alphanumerics.
fn is_token_char(c: char) -> bool {
    matches!(c, '0'..='9' | 'A'..='Z' | 'a'..='z')
}
1171
/// Measure the token at the start of `text`, returning its length in
/// bytes (0 when `text` does not begin with a token).
///
/// A token is a run of ASCII alphanumerics. A `-` is admitted only
/// when it is not the first byte and is immediately followed by an
/// alphanumeric, which keeps compounds such as `SI-G` and `SAR-BP`
/// in one piece while leaving leading, trailing, and doubled hyphens
/// outside the token.
fn scan_token(text: &str) -> usize {
    let raw = text.as_bytes();

    // Whether the byte at `pos` belongs to the token. The decision
    // depends only on the position and its right-hand neighbour.
    let continues_token = |pos: usize| match raw[pos] {
        b'-' => pos > 0 && raw.get(pos + 1).is_some_and(u8::is_ascii_alphanumeric),
        other => other.is_ascii_alphanumeric(),
    };

    // The token ends at the first byte that does not belong to it.
    (0..raw.len())
        .find(|&pos| !continues_token(pos))
        .unwrap_or(raw.len())
}
1192
/// CAPCO-2016-superseded tokens paired with their authoritative live
/// replacements, as `(superseded, replacement)`.
///
/// Every entry MUST cite a specific passage in
/// `crates/capco/docs/CAPCO-2016.md` (Constitution VIII); adding an
/// entry without a verified citation is a correctness defect.
///
/// - `COMINT` → `SI`: CAPCO-2016 §H.4 p74, inside §H.4 SCI Control
///   System Markings ("The COMINT title for the Special Intelligence
///   (SI) control system is no longer valid.").
const SUPERSEDED_TOKEN_MAP: &[(&str, &str)] = &[
    ("COMINT", "SI"),
];
1202
1203// ---------------------------------------------------------------------------
1204// Position-aware short-token classification heuristic (issue #133 PR 2)
1205// ---------------------------------------------------------------------------
1206
1207/// Try to fix a malformed leading classification token using a
1208/// keyboard-proximity heuristic.
1209///
1210/// `MIN_FUZZY_LEN = 3` blocks the vocab-based fuzzy matcher from
1211/// running on 1- and 2-character tokens — `R`, `W`, `YS`, `XS` etc.
1212/// are too short for edit-distance to be reliable against the closed
1213/// vocabulary alone. But when such a token sits at the **leading
1214/// classification position** of a portion or banner marking, the
1215/// position itself is strong evidence: the user intended a
1216/// classification level, and the malformed token is almost certainly
1217/// keyboard-adjacent to a real one.
1218///
1219/// This helper applies a small keyboard-proximity table to the first
1220/// whitespace-separated token of the first `//`-separated segment.
1221/// It returns the corrected text (with the leading token replaced)
1222/// when a rule fires. Returns `None` when the leading token is
1223/// already canonical, longer than 2 chars, or doesn't match any
1224/// rule.
1225///
1226/// # Confidence
1227///
1228/// The decoder tags this attempt's [`CanonicalAttempt::fix_source`]
1229/// with [`FixSource::DecoderClassificationHeuristic`]. The engine
1230/// then (a) downgrades the diagnostic severity to
1231/// [`Severity::Warn`](marque_rules::Severity::Warn) — always-visible
1232/// in `--check`, exits non-zero — and (b) caps
1233/// [`Confidence::rule`](marque_rules::Confidence) at `0.80` so
1234/// `combined ≤ 0.80` stays below the default `confidence_threshold`
1235/// of `0.95`. The heuristic only auto-applies in `--fix` mode when
1236/// the user has explicitly lowered the threshold, opting into the
1237/// heuristic's bar of evidence.
1238///
1239/// # Rules (CAPCO-2016 §A.2 classification levels: U, R, C, S, TS)
1240///
1241/// Length is checked first — a 2-char token never reaches the 1-char
1242/// table. The keyboard-proximity sets are derived from the standard
1243/// QWERTY layout: keys physically adjacent to S (`A`, `W`, `E`, `Z`)
1244/// likely correspond to S typos; keys adjacent to T (`R`, `Y`, `H`,
1245/// `G`, `F`) likely correspond to T typos when followed by an
1246/// S-cluster character (so the pair maps to `TS`). The table is
1247/// intentionally narrow — wider sets produce more false positives
1248/// in normal prose.
1249///
1250/// **Length 3** (issue #133 PR 8) — exactly one mapping:
1251/// - `OTP` → `TOP` (T↔O transposition; standard Levenshtein dist 2,
1252/// blocked by `MIN_USEFUL_CONFIDENCE` for 3-char inputs at dist 2,
1253/// so the vocab path can't catch it even with `TOP` in vocab).
1254///
1255/// The 3-char rule is intentionally a single hardcoded mapping —
1256/// the dense 3-char trigraph vocab (`TON`, `TUR`, `TWN`, …, 289
1257/// entries) means a wider "all transpositions of TOP" rule
1258/// would generate too many false positives. Other corpus-attested
1259/// 3-char `TOP` typos (`TPP`, `UOP`) are at standard Levenshtein
1260/// dist 1 from the bare `TOP` in `EXTENDED_CORRECTION_VOCAB` and
1261/// recover via the vocab path; only transposition (which standard
1262/// Levenshtein scores as dist 2) needs the heuristic. See
1263/// [`try_3char_classification_heuristic`] for the implementation
1264/// and the `try_3char_classification_heuristic_only_matches_otp`
1265/// regression-pin for the narrow-scope policy.
1266///
1267/// **Length 2** (checked second):
1268/// - `[T, R, Y, H, G][A, W, E, Z, S]` → `TS` (e.g., `RS`, `YS`, `HE`)
1269/// - `[F][A, W, E, Z, S]` → `TS` (e.g., `FS`, `FE`)
1270/// - `TP` → `TOP` (issue #133 PR 8; corpus-attested keyboard typo
1271/// where the middle `O` was elided; bare `TP` has no other
1272/// canonical CAPCO meaning).
1273/// - `TO` → `TOP` (issue #133 PR 8; same family — trailing `P`
1274/// elided).
1275///
1276/// **Length 1**:
1277/// - `[A, W, E, Z]` → `S` (S-key neighbors; bare `S` is canonical)
1278/// - `[V, F]` → `C` (C-key neighbors; bare `C` is canonical)
1279/// - `[X]` → `S` (X is between C and S on QWERTY; default to the
1280/// higher classification per the issue #133 PR 2 design note)
1281///
1282/// **Length 4+**: returns `None`. Long-token typos benefit from the
1283/// vocab-based fuzzy matcher (4-char `TDOP`/`QTOP`/`TOPW` recover
1284/// to `TOP` at edit distance 1 via the standard fuzzy path now
1285/// that `TOP` lives in `EXTENDED_CORRECTION_VOCAB`); the
1286/// keyboard-proximity heuristic adds nothing here.
1287///
1288/// **Bare canonical**: returns `None` when the leading token is
1289/// already a known classification short form (`U`, `R`, `C`, `S`,
1290/// `TS`) OR the bare leading word `TOP` of the two-word
1291/// `TOP SECRET` classification. PR 8 added `TOP` to the canonical
1292/// short-circuit set because the new length-3 `OTP→TOP` heuristic
1293/// would otherwise have to walk the heuristic path on every
1294/// already-canonical `TOP SECRET//...` input. The strict parser
1295/// already accepts all of these. See
1296/// [`is_canonical_short_classification`] for the implementation.
1297///
1298/// # CAB markings
1299///
1300/// Returns `None` when `text` looks like a CAB (Classification
1301/// Authority Block) — those are keyed authority lines, not
1302/// classification-leading shapes, and the heuristic would emit
1303/// nonsense if applied. The check mirrors [`is_cab_head`].
1304fn try_classification_heuristic_fix(text: &str) -> Option<String> {
1305 // Skip CAB shapes — they don't have a leading classification token.
1306 if is_cab_head(text.as_bytes()) {
1307 return None;
1308 }
1309
1310 // Strip portion-form parens (preserve them at output).
1311 let (open_paren, body, close_paren) = if text.starts_with('(') && text.ends_with(')') {
1312 ("(", &text[1..text.len() - 1], ")")
1313 } else {
1314 ("", text, "")
1315 };
1316
1317 // First `//`-separated segment carries the leading classification.
1318 let first_seg_end = body.find("//").unwrap_or(body.len());
1319 let first_seg = &body[..first_seg_end];
1320 let after_first_seg = &body[first_seg_end..];
1321
1322 // First whitespace-delimited token of that segment.
1323 let first_seg_trimmed_start = first_seg
1324 .char_indices()
1325 .find(|(_, c)| !c.is_whitespace())
1326 .map(|(i, _)| i)
1327 .unwrap_or(0);
1328 let leading_ws = &first_seg[..first_seg_trimmed_start];
1329 let after_leading_ws = &first_seg[first_seg_trimmed_start..];
1330 let token_end = after_leading_ws
1331 .find(char::is_whitespace)
1332 .unwrap_or(after_leading_ws.len());
1333 let first_token = &after_leading_ws[..token_end];
1334 let after_first_token = &after_leading_ws[token_end..];
1335
1336 // Bare canonical → no fix needed.
1337 if is_canonical_short_classification(first_token) {
1338 return None;
1339 }
1340
1341 // **Lone-input safety guard (issue #133 PR 4 / #176).** Skip the
1342 // heuristic when the input has no marking-shape signal beyond the
1343 // leading token — i.e., nothing after the first token within the
1344 // first segment AND no `//`-separated tail. The corpus measurement
1345 // committed at `tools/corpus-analysis/output/heuristic_frequencies.json`
1346 // validated heuristic confidence well above the acceptance
1347 // threshold only for the *in-context* case (trigger appears within
1348 // ~30 chars of `//` or a recognized vocab token). For lone inputs
1349 // the empirical FP rate against Enron body text is many orders of
1350 // magnitude higher — high-frequency triggers like `A` and `E` have
1351 // tens of thousands of unrestricted occurrences vs at most a few
1352 // hundred in marking-context, and a fix-and-warn that auto-applies
1353 // at default threshold would produce false positives on
1354 // parenthetical refs like `(A)` / `(W)` / `(F)` common in business
1355 // prose. Spot-check the evidence file directly for per-trigger
1356 // detail.
1357 //
1358 // Form-field input (`(YS)` typed into a portion-mark field)
1359 // SHOULD heuristic-fix at high confidence — the caller knows the
1360 // input is a marking attempt — but we don't yet have an input-
1361 // source signal to distinguish form-field from document-content.
1362 // Tracked in #176 (input-source signal on ParseContext); when
1363 // that lands, this safety guard becomes conditional on
1364 // `ParseContext::input_source == DocumentContent`.
1365 // Trailing whitespace doesn't count as "other content" — `(YS )`
1366 // is functionally equivalent to `(YS)` for the lone-case test.
1367 let has_other_marking_content = after_first_token.chars().any(|c| !c.is_whitespace())
1368 || after_first_seg.chars().any(|c| !c.is_whitespace());
1369 if !has_other_marking_content {
1370 return None;
1371 }
1372
1373 let replacement = match first_token.len() {
1374 3 => try_3char_classification_heuristic(first_token)?,
1375 2 => try_2char_classification_heuristic(first_token)?,
1376 1 => try_1char_classification_heuristic(first_token)?,
1377 _ => return None,
1378 };
1379
1380 Some(format!(
1381 "{open_paren}{leading_ws}{replacement}{after_first_token}{after_first_seg}{close_paren}"
1382 ))
1383}
1384
/// Whether `token` is already an acceptable leading classification
/// token: one of the CAPCO-2016 short forms (`U`, `R`, `C`, `S`,
/// `TS`) or the bare leading word `TOP` of the two-word `TOP SECRET`.
///
/// The comparison is exact and case-sensitive. Full-word forms
/// (`UNCLASSIFIED`, `RESTRICTED`, ...) are intentionally not matched:
/// a malformed full word is the vocab-based fuzzy matcher's job
/// (`SECRET` is in `correction_vocab`).
///
/// `TOP` joined the set in issue #133 PR 8. Before that the
/// whitespace tokenizer treated `TOP` as non-canonical and the
/// heuristic ran on perfectly canonical `TOP SECRET//...` input. That
/// was harmless while the length-3 rule always returned `None`, but
/// became a latent footgun once it started returning `Some`.
/// Recognizing bare `TOP` short-circuits the already-correct case.
fn is_canonical_short_classification(token: &str) -> bool {
    const CANONICAL_LEADING_TOKENS: [&str; 6] = ["U", "R", "C", "S", "TS", "TOP"];
    CANONICAL_LEADING_TOKENS
        .iter()
        .any(|known| *known == token)
}
1405
/// 2-character keyboard-proximity rule. Both bytes are compared
/// case-insensitively (ASCII). Two mappings, tried in order:
///
/// 1. T-cluster byte followed by an S-cluster byte → `TS` (the
///    original PR 2 rule). The T cluster is `T` plus the keys this
///    file treats as its QWERTY neighbors (`R`, `Y`, `H`, `G`, `F`);
///    the S cluster is `S` plus `A`, `W`, `E`, `Z`. The sets are wide
///    enough to catch common slips and narrow enough to leave
///    unrelated 2-char prose alone.
/// 2. The literal pairs `TP` / `TO` → `TOP` (issue #133 PR 8). These
///    are corpus-attested typos where the middle `O` (`TP`) or the
///    trailing `P` (`TO`) was elided. The pattern is a literal pair,
///    not a cluster, because something like `T[A-Z]` → `TOP` would
///    collide with too many real 2-char tokens. Neither pair has
///    another canonical CAPCO meaning in the leading classification
///    slot; the `REL TO` keyword is handled by the structural REL TO
///    parser, not here.
///
/// `TS` itself would satisfy rule 1, but the caller filters it out
/// beforehand via `is_canonical_short_classification`.
///
/// Expects a 2-byte `token` (checked with `debug_assert_eq!`).
/// Returns `None` when neither mapping applies.
fn try_2char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 2);

    match (raw[0].to_ascii_uppercase(), raw[1].to_ascii_uppercase()) {
        // Rule 1: T-cluster + S-cluster.
        (b'T' | b'R' | b'Y' | b'H' | b'G' | b'F', b'A' | b'W' | b'E' | b'Z' | b'S') => Some("TS"),
        // Rule 2: anchored on a literal leading `T`.
        (b'T', b'P' | b'O') => Some("TOP"),
        _ => None,
    }
}
1451
/// 3-character classification rule (issue #133 PR 8) for the leading
/// classification slot. It covers the one corpus-attested typo the
/// vocab-based fuzzy path cannot reach:
///
/// - **`OTP` → `TOP`** — a T↔O transposition. Standard Levenshtein
///   scores a transposition as two substitutions (distance 2), and
///   the fuzzy matcher's `MIN_USEFUL_CONFIDENCE` floor (0.45) rejects
///   distance-2 corrections on 3-char inputs (confidence 0.40).
///   Moving the matcher to Damerau-Levenshtein would recover the case
///   but widen the false-positive surface across the whole vocab, so
///   a targeted rule at this slot is the lower-blast-radius fix.
///
/// Distance-1 inputs such as `TPP` and `UOP` are left to the fuzzy
/// matcher, which catches them now that `TOP` lives in
/// `EXTENDED_CORRECTION_VOCAB`.
///
/// Expects a 3-byte `token` (checked with `debug_assert_eq!`).
/// Returns `None` for every other 3-char input; the rule is kept this
/// narrow to avoid false positives in the dense 3-char trigraph vocab
/// (`TON`, `TUR`, `TWN`, ...).
fn try_3char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 3);
    // Upstream `normalize_delimiters_and_case` already uppercases
    // ASCII, so folding case here is not strictly needed; it is kept
    // to match the sibling length-1 / length-2 helpers.
    raw[..3].eq_ignore_ascii_case(b"OTP").then_some("TOP")
}
1491
/// 1-char keyboard-proximity heuristic for the classification slot.
///
/// Maps a single mistyped key to `S` or `C` per the §A.2 short-form
/// classification ladder (see the module-level table for the
/// per-character rationale):
///
/// - `A`, `W`, `E`, `Z`, `X` → `S`
/// - `V`, `F` → `C`
///
/// Matching is ASCII case-insensitive. Any other byte yields `None`.
/// The caller is expected to pass a token of exactly one byte
/// (checked with `debug_assert_eq!`).
fn try_1char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 1);
    let key = raw[0].to_ascii_uppercase();

    // `X` is adjacent to both `C` and `S` on QWERTY. It resolves to
    // the higher classification (`S`) per the issue #133 PR 2 design
    // note: for IC compliance work an under-classified result costs
    // more than an over-classified one.
    if matches!(key, b'A' | b'W' | b'E' | b'Z' | b'X') {
        Some("S")
    } else if matches!(key, b'V' | b'F') {
        Some("C")
    } else {
        None
    }
}
1509
1510// ---------------------------------------------------------------------------
1511// Missing-delimiter insertion (issue #133 PR 3)
1512// ---------------------------------------------------------------------------
1513
1514/// Try to insert missing `//` segment separators at category-transition
1515/// boundaries.
1516///
1517/// CAPCO grammar requires `//` between segments —
1518/// `CLASSIFICATION//SCI_BLOCK//SAR_BLOCK//DISSEM_BLOCK`. Real-world
1519/// transcription frequently substitutes whitespace for one or more
1520/// `//` separators, producing inputs the strict parser cannot
1521/// recover (`SECRET//NOFORN EXDIS` strict-parses as
1522/// `classification: Secret, dissem: [Nf]` with `EXDIS` left as
1523/// `TokenKind::Unknown`; the decoder's step-3a Unknown-span filter
1524/// then discards the candidate).
1525///
1526/// This helper walks the input left-to-right and inserts `//` at
1527/// whitespace gaps that separate two distinct CAPCO segments. Two
1528/// rules drive insertion:
1529///
1530/// 1. **Classification → next segment.** Tokens at the start of the
1531/// input are classification-context (`U`, `R`, `C`, `S`, `TS`,
1532/// `UNCLASSIFIED`, …, plus the `TOP SECRET` two-word
1533/// classification). The first non-classification token after the
1534/// classification phrase, when no `//` has been emitted yet,
1535/// triggers `//` insertion before it. Covers the
1536/// `TOP SECRET HCS-P INTEL OPS//ORCON/NOFORN` / `SECRET REL TO
1537/// USA, AUS, GBR` family.
1538///
1539/// 2. **Hard-splitter dissem long-form.** A small set of unambiguous
1540/// long-form dissem control tokens (`NOFORN`, `ORCON`,
1541/// `ORCON-USGOV`, `PROPIN`, `IMCON`, `RELIDO`, `RSEN`,
1542/// `EYESONLY`, `EXDIS`, `NODIS`, `LIMDIS`, `FOUO`, `FISA`,
1543/// `DSEN`) ALWAYS start a new segment when they appear after a
1544/// whitespace gap, regardless of preceding context — these
1545/// tokens have no in-segment role inside SCI/SAR/REL TO
1546/// blocks. Covers the `NOFORN EXDIS` / `... SI NOFORN` /
1547/// `... HCS-P INTEL OPS ORCON/NOFORN` family. The full set is
1548/// pinned by [`is_hard_splitter_covers_documented_long_forms`].
1549///
1550/// Exceptions (do NOT insert):
1551///
1552/// - `SBU NOFORN` / `LES NOFORN` — non-IC dissem **banner long
1553/// forms** for `NonIcDissem::SbuNf` / `NonIcDissem::LesNf`. When
1554/// the previous token is `SBU` or `LES`, treat `NOFORN` as part
1555/// of the multi-word atom.
1556///
1557/// Returns `None` when no insertion was made — the caller should
1558/// not emit a duplicate of the input.
1559///
1560/// # Bounded
1561///
1562/// Hard-capped at [`MAX_DELIMITER_INSERTIONS`] insertions per call.
1563/// More than four insertions in a single marking is suspicious and
1564/// likely indicates the input isn't a CAPCO marking at all (or the
1565/// helper is wrong); rather than emit a wildly-rewritten candidate,
1566/// we cap and let the result strict-parse on the partial rewrite.
1567///
1568/// # SCI / SAR / SPECIAL-ACCESS-REQUIRED coverage
1569///
1570/// The PR-3-era doc note here used to defer SCI-starter (`TOP SECRET
1571/// SI ...`), SAR-prefix (`TOP SECRET SAR-BP ...`), and
1572/// `SPECIAL ACCESS REQUIRED-...` insertion to a follow-up. That defer
1573/// was based on a misread: rule 1 (classification → next segment)
1574/// already fires on every one of those shapes because
1575/// [`is_classification_token`] includes `TOP` and
1576/// [`is_classification_continuation`] handles the `TOP → SECRET`
1577/// special case, so the helper produces the canonical bytes for all
1578/// 17 MissingDelimiter fixtures in the SC-004 corpus. The remaining
1579/// 2/17 failures pre-PR-5 were a SCORING contest, not a missing
1580/// rewrite — handled by [`HARD_SPLITTER_ABSORPTION_PENALTY`] in
1581/// [`score_candidate`], not here.
1582fn try_insert_delimiter(text: &str) -> Option<String> {
1583 let bytes = text.as_bytes();
1584 let mut result = String::with_capacity(text.len() + 8);
1585 let mut insertions = 0;
1586
1587 let mut prev_token: Option<&str> = None;
1588 let mut in_classification = true;
1589 let mut seen_double_slash = false;
1590
1591 let mut i = 0;
1592 while i < bytes.len() {
1593 // Existing `//` delimiter — copy and reset state.
1594 if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
1595 result.push_str("//");
1596 seen_double_slash = true;
1597 in_classification = false;
1598 prev_token = None;
1599 i += 2;
1600 continue;
1601 }
1602
1603 // Whitespace run — collect, then look at next token.
1604 if bytes[i].is_ascii_whitespace() {
1605 let ws_start = i;
1606 while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1607 i += 1;
1608 }
1609 let ws = &text[ws_start..i];
1610
1611 // Find the next token (alnum + internal `-`) starting at `i`.
1612 let token_start = i;
1613 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
1614 i += 1;
1615 }
1616 if token_start == i {
1617 // Whitespace then non-token character (e.g., `,` or `/` or end).
1618 // Just copy the whitespace and continue.
1619 result.push_str(ws);
1620 continue;
1621 }
1622 let next_token = &text[token_start..i];
1623
1624 let should_insert = decide_insert_delimiter(
1625 prev_token,
1626 next_token,
1627 in_classification,
1628 seen_double_slash,
1629 );
1630
1631 if should_insert && insertions < MAX_DELIMITER_INSERTIONS {
1632 result.push_str("//");
1633 insertions += 1;
1634 seen_double_slash = true;
1635 in_classification = false;
1636 } else {
1637 result.push_str(ws);
1638 }
1639 result.push_str(next_token);
1640
1641 // Update state.
1642 if !is_classification_continuation(next_token, prev_token) {
1643 in_classification = false;
1644 }
1645 prev_token = Some(next_token);
1646 continue;
1647 }
1648
1649 // Non-whitespace, non-`//` character — likely a `/` (single
1650 // slash, used as intra-segment separator e.g.
1651 // `ORCON/NOFORN`), comma, paren, or part of a token. Copy
1652 // verbatim and continue. Tokens that contain only alnum + `-`
1653 // are handled in the whitespace branch via the lookahead;
1654 // the leading-token-at-position-0 case enters here.
1655 let other_start = i;
1656 // Take a token (alnum + internal `-`) if at one.
1657 if bytes[i].is_ascii_alphanumeric() {
1658 while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
1659 i += 1;
1660 }
1661 let leading_token = &text[other_start..i];
1662 result.push_str(leading_token);
1663 // Update prev_token / classification state for the
1664 // leading token (no insertion possible at position 0).
1665 if !is_classification_continuation(leading_token, prev_token) {
1666 in_classification = false;
1667 }
1668 prev_token = Some(leading_token);
1669 continue;
1670 }
1671
1672 // Single non-token character (`/`, `(`, `)`, `,`, or any
1673 // non-ASCII character — e.g., a stray `∕` that the upstream
1674 // delimiter normalizer didn't catch). Preserve the original
1675 // UTF-8 character verbatim instead of doing `bytes[i] as
1676 // char`, which would corrupt multi-byte sequences by emitting
1677 // each byte as a separate Latin-1 codepoint.
1678 let ch = text[i..]
1679 .chars()
1680 .next()
1681 .expect("byte index must remain on a char boundary");
1682 result.push(ch);
1683 i += ch.len_utf8();
1684 }
1685
1686 if insertions == 0 { None } else { Some(result) }
1687}
1688
/// Upper bound on how many `//` separators a single
/// [`try_insert_delimiter`] call may insert.
///
/// Needing more than 4 insertions in one marking is a strong hint
/// that the input is not a marking at all — real markings rarely
/// have that many segments. Capping keeps the helper from heavily
/// rewriting ordinary prose that merely contains splitter words.
const MAX_DELIMITER_INSERTIONS: usize = 4;
1695
1696/// Decide whether to insert `//` at a whitespace gap before
1697/// `next_token`. See [`try_insert_delimiter`] doc for the rules.
1698fn decide_insert_delimiter(
1699 prev_token: Option<&str>,
1700 next_token: &str,
1701 in_classification: bool,
1702 seen_double_slash: bool,
1703) -> bool {
1704 // Multi-word atom exceptions: don't split between SBU/LES and
1705 // their NOFORN companion (banner long forms for NonIcDissem
1706 // SbuNf/LesNf).
1707 if next_token == "NOFORN" && matches!(prev_token, Some("SBU") | Some("LES")) {
1708 return false;
1709 }
1710
1711 // Rule 1: classification → next segment. The first non-
1712 // classification token after the classification phrase, when no
1713 // `//` has been emitted yet.
1714 if in_classification && !seen_double_slash && !is_classification_token(next_token) {
1715 return true;
1716 }
1717
1718 // Rule 2: hard-splitter dissem long-form. These tokens always
1719 // start a new segment when they appear after whitespace.
1720 is_hard_splitter(next_token)
1721}
1722
/// Reports whether `token` is one of the classification short or
/// long forms that may appear in classification context.
///
/// The comparison is exact and case-sensitive. `TOP` is included so
/// the first word of `TOP SECRET` counts as classification context.
fn is_classification_token(token: &str) -> bool {
    // Short forms first, then long forms.
    const CLASSIFICATION_FORMS: [&str; 10] = [
        "U",
        "R",
        "C",
        "S",
        "TS",
        "TOP",
        "UNCLASSIFIED",
        "RESTRICTED",
        "CONFIDENTIAL",
        "SECRET",
    ];
    CLASSIFICATION_FORMS.contains(&token)
}
1739
1740/// True when `next_token` continues the classification phrase from
1741/// `prev_token`. Specifically: `TOP SECRET` is the only multi-word
1742/// classification CAPCO recognizes; `SECRET` after `TOP` continues
1743/// the classification.
1744fn is_classification_continuation(next_token: &str, prev_token: Option<&str>) -> bool {
1745 if next_token == "SECRET" && prev_token == Some("TOP") {
1746 return true;
1747 }
1748 is_classification_token(next_token)
1749}
1750
/// Reports whether `token` is an unambiguous, segment-starting
/// dissem long form.
///
/// These tokens have no in-segment role inside SCI / SAR / REL TO
/// blocks, so one appearing after whitespace always signals a
/// missing `//` separator. Pinned by
/// `try_insert_delimiter_inserts_before_long_form_dissem`.
///
/// Deliberately NOT in this set:
///
/// - Dissem short forms (`NF`, `OC`, `PR`, `IMC`, `RS`) — they could
///   collide with SAR compartment / sub-compartment naming.
/// - SCI starters (`SI`, `HCS`, `TK`, `KDK`) — short tokens that
///   also appear in compartment context.
/// - SAR prefixes (`SAR-*`) — handled in v2 with classification-
///   context lookahead.
///
/// The comparison is exact and case-sensitive.
fn is_hard_splitter(token: &str) -> bool {
    const HARD_SPLITTERS: [&str; 14] = [
        "NOFORN",
        "ORCON",
        "ORCON-USGOV",
        "PROPIN",
        "IMCON",
        "RELIDO",
        "RSEN",
        "EYESONLY",
        "FOUO",
        "FISA",
        "DSEN",
        "EXDIS",
        "NODIS",
        "LIMDIS",
    ];
    HARD_SPLITTERS.contains(&token)
}
1784
1785// ---------------------------------------------------------------------------
1786// SAR indicator-keyword structural repair (issue #133 PR 6)
1787// ---------------------------------------------------------------------------
1788
1789/// Repair stray-prefix and missing-hyphen mangling around the SAR
1790/// `SAR-` indicator (CAPCO-2016 §H.5 p100). Two structural patterns:
1791///
1792/// 1. **Prefix strip** — `<boundary>[A-Z]{1,3}SAR-` → `<boundary>SAR-`.
1793/// Strips ANY attached 1–3 letter ASCII-uppercase prefix before
1794/// the SAR indicator, including prefixes whose bytes happen to
1795/// spell a known CAPCO token (`U`, `S`, `SI`, `USA`, …). Canonical
1796/// CAPCO never glues a classification token, SCI control, or
1797/// trigraph directly to `SAR-` without a `//` separator, so a
1798/// prefix at a `//`/`(`/start boundary is OCR/transcription drift
1799/// regardless of whether the prefix bytes form a CVE token in
1800/// isolation. Recovers `SECRET//USAR-BP-J12...` →
1801/// `SECRET//SAR-BP-J12...` and `(USASAR-BP)` → `(SAR-BP)`. The
1802/// "smallest prefix that aligns with `SAR-`" wins (see
1803/// [`match_sar_prefix`]) so an ambiguous input like `USASAR-`
1804/// strips the longest aligning prefix (`USA`, length 3) — there
1805/// is no shorter alignment because `USASAR-` only contains `SAR-`
1806/// starting at offset 3. An earlier defensive guard that refused
1807/// to strip CAPCO-token prefixes was removed because it broke
1808/// the central `USAR-` case (`U` IS the UNCLASSIFIED portion
1809/// form); the test
1810/// `sar_indicator_repair_strips_even_capco_token_prefix` pins
1811/// the policy.
1812///
1813/// 2. **Missing-hyphen insertion** — `<boundary>SAR[A-Z0-9]{2,3}<delim>`
1814/// → `<boundary>SAR-[A-Z0-9]{2,3}<delim>`, where `<delim>` is `-`,
1815/// `/`, ASCII whitespace, or end-of-string. Recovers
1816/// `TOP SECRET//SARBP//NOFORN` → `TOP SECRET//SAR-BP//NOFORN` and
1817/// `SARBP-J12` → `SAR-BP-J12`.
1818///
1819/// Returns `None` when no change was made; the caller's `emit` dedup
1820/// would otherwise drop the duplicate candidate but the explicit
1821/// `None` saves the alloc.
1822///
1823/// # Why these patterns are structurally safe
1824///
1825/// Both patterns operate on the SAR **indicator keyword** (the literal
1826/// `SAR-` per §H.5 p100), not on the open-vocabulary program
1827/// identifier that follows. A prefix strip removes characters that
1828/// have no role in the CAPCO grammar — there is no marking syntax
1829/// where 1–3 alphabetic characters precede `SAR-` at a `//`/`(`/
1830/// start-of-string boundary. A missing-hyphen insertion adds the
1831/// syntactic separator the §H.5 grammar requires between the indicator
1832/// and the program identifier; it does not invent or modify the
1833/// identifier itself. Neither fix claims anything about SAR program-
1834/// identifier validity (which is agency-assigned and outside the
1835/// marque vocab — see `SAR_STRUCTURAL_KEYWORDS` in
1836/// `crates/ism/src/token_set.rs`). The corpus enhancement to fuzzy-
1837/// match against per-org SAR identifier lists is intentionally
1838/// deferred (issue follow-up): config-loaded vocab is a separate
1839/// trust boundary that needs its own design pass.
1840///
1841/// `SPECIAL ACCESS REQUIRED-` (the `Full` indicator form) is NOT
1842/// handled by this helper. The dominant `Full`-form failure mode in
1843/// the mangled corpus is a typo inside the indicator keywords
1844/// themselves (`SPCIAL`, `CCESS`, `SPECAL`), which is recovered by
1845/// the existing fuzzy matcher now that `SPECIAL` and `ACCESS` live in
1846/// `SAR_STRUCTURAL_KEYWORDS`. A `Full`-form analogue can land if a
1847/// future fixture surfaces with a stray prefix on
1848/// `SPECIAL ACCESS REQUIRED-`.
1849fn try_sar_indicator_repair(text: &str) -> Option<String> {
1850 // Cheap pre-check: if `SAR` doesn't appear at all, no repair is
1851 // possible. Saves the byte-walk cost on the overwhelmingly common
1852 // case where the input has no SAR block.
1853 if !text.contains("SAR") {
1854 return None;
1855 }
1856
1857 let bytes = text.as_bytes();
1858 // Lazy allocation: `result` stays `None` until the first repair
1859 // pattern matches, at which point we allocate and copy the
1860 // verbatim prefix `text[..first_match_start]` into it. Inputs that
1861 // contain `SAR` but no repair-eligible pattern (the common case
1862 // for canonical SAR markings like `SECRET//SAR-BP//NOFORN`) walk
1863 // the bytes without ever allocating the output string. The
1864 // bytes-walk-only-no-alloc path matters because every candidate
1865 // bytes attempt the decoder generates calls into this helper, so
1866 // a per-call allocation would multiply allocator pressure across
1867 // the K candidates / N inputs hot path of the recognizer.
1868 let mut result: Option<String> = None;
1869 // `last_copied` is the byte index up to which `result` has been
1870 // populated. When a repair fires, we batch-copy the verbatim span
1871 // `text[last_copied..i]` into `result` before pushing the
1872 // canonical replacement; on the final return we flush
1873 // `text[last_copied..]`. The batch-copy approach also avoids the
1874 // per-character `chars().next()` UTF-8 iteration cost on the
1875 // verbatim-byte stretches.
1876 let mut last_copied: usize = 0;
1877 let mut i = 0;
1878
1879 while i < bytes.len() {
1880 let at_boundary =
1881 i == 0 || matches!(bytes[i - 1], b'/' | b'(' | b' ' | b'\t' | b'\n' | b'\r');
1882
1883 if at_boundary {
1884 // Pattern A: <prefix>SAR- where prefix is 1-3 ASCII
1885 // uppercase letters. The prefix is always treated as
1886 // noise to be stripped; a "known CAPCO word" defense
1887 // (refuse to strip if `U`, `USA`, `SI`, …) was tried
1888 // and rejected because it broke the central
1889 // `USAR-` case — `U` IS a CVE token (the
1890 // classification portion form for UNCLASSIFIED) but
1891 // canonical CAPCO never glues `U` directly to `SAR-`
1892 // without a `//` separator. Same logic applies to every
1893 // other CVE token in this position: a classification or
1894 // SCI control or trigraph that immediately precedes
1895 // `SAR-` with no separator is not a valid CAPCO marking
1896 // shape (the classification segment ends, `//` begins
1897 // the next segment, then SAR- starts the SAR block).
1898 // So an apparent prefix at a boundary directly before
1899 // `SAR-` is OCR/transcription drift regardless of
1900 // whether the prefix bytes spell a CAPCO token.
1901 if let Some((_prefix_len, post)) = match_sar_prefix(bytes, i) {
1902 let r = result.get_or_insert_with(|| String::with_capacity(text.len() + 4));
1903 r.push_str(&text[last_copied..i]);
1904 r.push_str("SAR-");
1905 last_copied = post;
1906 i = post;
1907 continue;
1908 }
1909
1910 // Pattern B: SAR<2-3 alnum><delim>. The CAPCO §H.5 p100
1911 // SAR program identifier (Abbrev form) is exactly 2-3
1912 // alphanumeric characters; the canonical form requires a
1913 // hyphen between SAR and the identifier. Inserting that
1914 // hyphen does not invent identifier vocabulary.
1915 if let Some(end) = match_sar_missing_hyphen(bytes, i) {
1916 let r = result.get_or_insert_with(|| String::with_capacity(text.len() + 4));
1917 r.push_str(&text[last_copied..i]);
1918 r.push_str("SAR-");
1919 r.push_str(&text[i + 3..end]);
1920 last_copied = end;
1921 i = end;
1922 continue;
1923 }
1924 }
1925
1926 // Default: advance past the current UTF-8 char without copying.
1927 // The verbatim span [last_copied..i] gets batch-copied into
1928 // `result` the next time a repair pattern fires (or flushed
1929 // on return below). Using char iteration rather than
1930 // `bytes[i] as char` keeps `i` aligned to char boundaries so
1931 // the `text[last_copied..i]` slice indexing is always valid
1932 // — multi-byte sequences (rare but possible in OCR'd input)
1933 // therefore round-trip intact.
1934 let ch = text[i..]
1935 .chars()
1936 .next()
1937 .expect("byte index must remain on a char boundary");
1938 i += ch.len_utf8();
1939 }
1940
1941 // Flush any verbatim trailing span into the result. If `result`
1942 // is still `None`, no repair fired, and we never allocated —
1943 // return `None` to signal the no-op path.
1944 result.map(|mut r| {
1945 r.push_str(&text[last_copied..]);
1946 r
1947 })
1948}
1949
/// Looks for `[A-Z]{1,3}SAR-` starting at byte position `i`.
///
/// On a match returns `(prefix_len, post_index)`, where `prefix_len`
/// is the number of stray uppercase letters in front of `SAR-` and
/// `post_index` is the byte index just past the `-`. Returns `None`
/// when the pattern does not match, including when `i` is at or
/// beyond the end of `bytes`.
///
/// Prefix lengths are tried in the order 1, 2, 3 and the **smallest**
/// one that lines up with a literal `SAR-` wins. This is the
/// conservative choice: a 1-letter prefix (`U` in `USAR-`) is the
/// most likely OCR/transcription drift, and stripping fewer
/// characters is the lower-risk repair when shorter and longer
/// readings are both possible.
fn match_sar_prefix(bytes: &[u8], i: usize) -> Option<(usize, usize)> {
    let tail = bytes.get(i..)?;

    // How many of the first three bytes form an unbroken run of ASCII
    // uppercase letters — the longest prefix worth considering.
    let longest_prefix = tail
        .iter()
        .take(3)
        .take_while(|b| b.is_ascii_uppercase())
        .count();

    // `starts_with` is false when fewer than four bytes remain, so no
    // separate length check is needed.
    (1..=longest_prefix)
        .find(|&prefix_len| tail[prefix_len..].starts_with(b"SAR-"))
        .map(|prefix_len| (prefix_len, i + prefix_len + 4))
}
1976
/// Looks for `SAR[A-Z0-9]{2,3}<delim>` starting at byte position `i`.
///
/// On a match returns the byte index one past the alphanumeric run —
/// the index of the delimiter, or `bytes.len()` when the run reaches
/// the end of input. `<delim>` is `-`, `/`, space, tab, newline,
/// carriage return, or end of input.
///
/// Returns `None` when the pattern does not match. That includes the
/// canonical `SAR-` shape: the `-` ends the alphanumeric scan
/// immediately after `SAR`, giving a run length of 0.
///
/// The run is measured with `is_ascii_alphanumeric`, so lowercase
/// ASCII letters would also count.
// NOTE(review): presumably inputs are already upper-cased by the time
// this runs, which would make the lowercase case moot — confirm
// against the caller.
fn match_sar_missing_hyphen(bytes: &[u8], i: usize) -> Option<usize> {
    // Everything after a literal `SAR` at position `i`, if present.
    let after_sar = bytes.get(i..)?.strip_prefix(b"SAR")?;

    let run = after_sar
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric())
        .count();
    if run < 2 || run > 3 {
        return None;
    }

    // The identifier must be followed by a delimiter or end of input.
    match after_sar.get(run).copied() {
        None | Some(b'-' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => Some(i + 3 + run),
        Some(_) => None,
    }
}
2002
2003// ---------------------------------------------------------------------------
2004// Stray-character `/X/` recovery (issue #133 PR 7)
2005// ---------------------------------------------------------------------------
2006
2007/// Walk `text` looking for the `<alnum>/<single_alnum_char>/<alnum>`
2008/// pattern. For each match (currently only the first match is
2009/// processed — see "scope" below) emit three candidate transforms:
2010///
2011/// 1. **Drop X** — `A/X/B` → `A//B`. Recovers stray characters
2012/// inserted between two valid tokens. Example:
2013/// `SECRET//NOFORN/R/EXDIS` → `SECRET//NOFORN//EXDIS` (the stray
2014/// `/R/` between NOFORN and EXDIS is removed).
2015///
2016/// 2. **Right-attach X** — `A/X/B` → `A//XB`. Recovers a single
2017/// character that got separated from the start of the right
2018/// token by a `/`. Example: `TOP SECRET//SI/N/OFORN` →
2019/// `TOP SECRET//SI//NOFORN` (the `N` was the leading character
2020/// of `NOFORN`).
2021///
2022/// 3. **Left-attach X** — `A/X/B` → `AX//B`. Recovers a single
2023/// character that got separated from the end of the left token
2024/// by a `/`. Example: `SECRE/T/REL TO USA, AUS, GBR` →
2025/// `SECRET//REL TO USA, AUS, GBR` (the `T` was the trailing
2026/// character of `SECRET`).
2027///
2028/// All three transforms are emitted as candidates; the recognizer's
2029/// step-3a [`TokenKind::Unknown`](marque_ism::TokenKind::Unknown)
2030/// filter is the natural disambiguator. For each input only one of
2031/// the three transforms produces fully-recognized tokens — the
2032/// other two leave broken-token fragments (`OFORN`, `NOFORNR`,
2033/// `SECRER`, …) that survive strict parsing as `TokenKind::Unknown`
2034/// and get dropped before scoring. The decoder doesn't need a
2035/// per-pattern lookup table to choose the right transform; the
2036/// vocab does the choosing implicitly.
2037///
2038/// # Scope (PR 7)
2039///
2040/// Only the FIRST `/X/` match in the input is processed; an input
2041/// with multiple stray-character patterns (e.g., `S/I/T/K`) is not
2042/// fully recovered by a single pass. The current corpus has very
2043/// few multi-pattern inputs (1–2 in the unresolved Typo set), and
2044/// adding a multi-pass loop here would complicate the candidate cap
2045/// in [`generate_candidate_bytes`] without proportional benefit. A
2046/// future PR can iterate if multi-pattern recovery becomes
2047/// load-bearing for SC-004 movement.
2048///
2049/// # Pattern boundary requirements
2050///
2051/// The `/X/` match requires alphanumeric context on both sides
2052/// (`<alnum>/<X>/<alnum>`). Without those guards the pattern would
2053/// fire on edge cases like `(/X/)` (start of portion form) where
2054/// the surrounding context is structural punctuation, not a token —
2055/// the recovery would be semantically meaningless there because
2056/// there's no token to attach `X` to.
fn try_collapse_stray_char_slash(text: &str) -> Vec<String> {
    let raw = text.as_bytes();
    let is_token_byte = |idx: usize| raw.get(idx).map_or(false, |b| b.is_ascii_alphanumeric());

    // Locate the first `/X/` window (X = exactly one ASCII
    // alphanumeric) that is flanked by alphanumerics on both sides.
    // The single-character requirement keeps `/AB/` (a real 2-char
    // token between slashes) from matching; the flank check keeps
    // shapes like `(/X/)` from matching, where there is no token to
    // attach `X` to.
    let first_hit = raw.windows(3).enumerate().find_map(|(at, window)| {
        let shaped =
            window[0] == b'/' && window[1].is_ascii_alphanumeric() && window[2] == b'/';
        let flanked = at > 0 && is_token_byte(at - 1) && is_token_byte(at + 3);
        if shaped && flanked {
            Some((at, char::from(window[1])))
        } else {
            None
        }
    });

    let (at, stray) = match first_hit {
        Some(found) => found,
        None => return Vec::new(),
    };

    // Slicing `text` directly is safe: `at` and `at + 3` both sit
    // next to ASCII `/` bytes, so they are char boundaries.
    let head = &text[..at];
    let tail = &text[at + 3..];

    // Only this first match is rewritten. The three candidates are
    // returned in a fixed order; the caller decides which survives.
    vec![
        // 1. Drop X:          `A/X/B` → `A//B`
        format!("{head}//{tail}"),
        // 2. Right-attach X:  `A/X/B` → `A//XB`
        format!("{head}//{stray}{tail}"),
        // 3. Left-attach X:   `A/X/B` → `AX//B`
        format!("{head}{stray}//{tail}"),
    ]
}
2116
2117// ---------------------------------------------------------------------------
2118// REL TO structural repair (issue #133 PR 9)
2119// ---------------------------------------------------------------------------
2120
2121/// REL TO structural repair.
2122///
2123/// Recovers four classes of REL TO structural typos that produce no
2124/// valid REL TO block in the strict parse path. All four are
2125/// **structural** (literal-shape) repairs, not vocabulary-based fuzzy
2126/// guesses — they fire only when the observed pattern is invalid
2127/// CAPCO AND the corrected pattern is unambiguously the intended form.
2128/// The riskier per-trigraph fuzzy-correction cluster (e.g.,
2129/// `USB → USA`, `AUT → AUS`) is deferred to issue #186 because it
2130/// requires corpus-weighted priors + block-level CAPCO §H.8
2131/// invariants to disambiguate safely.
2132///
2133/// # Patterns
2134///
2135/// 1. **Header transposition** — `REL OT ` → `REL TO `. The CAPCO
2136/// `REL` token has exactly two valid extensions (`REL TO` and
2137/// `RELIDO`); `REL OT` cannot appear in any valid CAPCO marking,
2138/// so the literal-bytes replacement is collision-free.
2139///
2140/// 2. **Header token-boundary** — `RELT O ` → `REL TO `. `RELT` is
2141/// not a CVE token, and `T O` as adjacent single-letter tokens
2142/// has no valid CAPCO meaning. The replacement reconstructs the
2143/// intended `REL TO ` header by migrating the trailing `T` from
2144/// `RELT` to the start of `O`.
2145///
2146/// 3. **Entry token-boundary** — `,A US,` → `,AUS,` (within a
2147/// REL TO block). A 1-letter + space + 2-letter sequence between
2148/// commas only fires when the joined 3-letter string is a known
2149/// trigraph (`is_trigraph` check) AND the 1-letter alone is not a
2150/// trigraph. The trigraph guard is what makes this safe — without
2151/// it, `,A B,` → `,AB,` would fire for any combination, but with
2152/// it the only joins that survive are those that round-trip
2153/// through the strict REL TO parser as valid trigraphs.
2154///
2155/// 4. **Entry comma misplacement** — `AU,S ` → `AUS, ` (within a
2156/// REL TO block). A 2-letter run + comma + 1-letter + space only
2157/// fires when the joined 3-letter string is a known trigraph AND
2158/// the 2-letter run alone is not. Same trigraph guard as
2159/// pattern 3 — the structural transform requires the corrected
2160/// output to be a valid trigraph.
2161///
2162/// # Scope (PR 9)
2163///
2164/// Patterns 1 and 2 affect the literal `REL TO` header and run
2165/// regardless of what follows. Patterns 3 and 4 require a `REL TO `
2166/// header in the input — they scan from each `REL TO ` substring
2167/// forward to the next `//` (or end of text) and only operate on
2168/// comma-separated entries within that block.
2169///
2170/// All four transforms are conservative: their false-positive risk
2171/// is bounded by the literal patterns not appearing in any valid
2172/// CAPCO text (patterns 1, 2) or by the `is_trigraph` guard
2173/// rejecting joins that aren't real country codes (patterns 3, 4).
2174/// The trigraph dictionary itself is the source of authority — no
2175/// new vocabulary is invented.
2176///
2177/// Returns `None` when no pattern matched. Allocation behavior:
2178///
2179/// - Inputs with no `REL` substring short-circuit before any work.
2180/// - Inputs with `REL` but no header-typo pattern run the header
2181/// walk allocation-free; the entry-level pass then short-circuits
2182/// on inputs lacking a literal `REL TO ` anchor.
2183/// - Inputs containing `REL TO ` in canonical form walk the entries
2184/// without allocating until a fix actually fires.
2185///
2186/// Allocation only occurs once a pattern produces a fixed string.
2187fn try_rel_to_structural_repair(text: &str) -> Option<String> {
2188 // Cheap pre-check: if `REL` doesn't appear at all, no repair is
2189 // possible. Saves the byte-walk cost on the overwhelmingly common
2190 // case where the input has no REL block.
2191 if !text.contains("REL") {
2192 return None;
2193 }
2194
2195 let mut working: Option<String> = None;
2196 let mut any_change = false;
2197
2198 // Patterns 1 and 2: header normalization. Apply first so the
2199 // entry-level scan in patterns 3 and 4 sees a canonical `REL TO `
2200 // header to anchor on.
2201 if let Some(normalized) = try_rel_to_header_normalize(text) {
2202 working = Some(normalized);
2203 any_change = true;
2204 }
2205
2206 // Patterns 3 and 4: entry-level fixes. Operate on the
2207 // header-normalized text when patterns 1 or 2 fired, otherwise on
2208 // the raw input.
2209 let entry_input: &str = working.as_deref().unwrap_or(text);
2210 if let Some(entry_fixed) = try_rel_to_entry_normalize(entry_input) {
2211 working = Some(entry_fixed);
2212 any_change = true;
2213 }
2214
2215 if any_change { working } else { None }
2216}
2217
/// Header normalization (patterns 1 and 2).
///
/// Rewrites every `REL OT ` (transposed letters) and `RELT O `
/// (misplaced space) that starts at a token boundary into the
/// canonical `REL TO `. A position is a token boundary when it is the
/// start of `text` or is preceded by `/`, `(`, space, tab, `\n` or
/// `\r`; a typo embedded in a longer run such as `XREL OT ` is
/// therefore left untouched.
///
/// Returns `None` when nothing matched. The output `String` is created
/// on the first match only, so typo-free input (for example a canonical
/// `REL TO USA, AUS, GBR`) is scanned without allocating.
fn try_rel_to_header_normalize(text: &str) -> Option<String> {
    const CANONICAL: &str = "REL TO ";
    // Both typo spellings are exactly as long as the canonical header,
    // so one fixed-width window serves every comparison.
    const WINDOW: usize = CANONICAL.len();
    const TYPOS: [&[u8]; 2] = [b"REL OT ", b"RELT O "];

    let raw = text.as_bytes();
    let mut rebuilt: Option<String> = None;
    // Everything before `flushed` has already been copied into `rebuilt`.
    let mut flushed = 0usize;
    let mut pos = 0usize;

    while pos < raw.len() {
        let starts_token = match pos.checked_sub(1) {
            None => true,
            Some(prev) => matches!(raw[prev], b'/' | b'(' | b' ' | b'\t' | b'\n' | b'\r'),
        };
        let is_typo = starts_token
            && matches!(raw.get(pos..pos + WINDOW), Some(window) if TYPOS.contains(&window));

        if is_typo {
            let buf = rebuilt.get_or_insert_with(|| String::with_capacity(text.len()));
            buf.push_str(&text[flushed..pos]);
            buf.push_str(CANONICAL);
            pos += WINDOW;
            flushed = pos;
        } else {
            // Step over one whole scalar value so `pos` stays on a char
            // boundary and the slicing above never splits a code point.
            let width = text[pos..]
                .chars()
                .next()
                .expect("byte index must remain on a char boundary")
                .len_utf8();
            pos += width;
        }
    }

    // Append the untouched tail only when at least one rewrite happened.
    rebuilt.map(|mut buf| {
        buf.push_str(&text[flushed..]);
        buf
    })
}
2271
2272/// Patterns 3 and 4 — entry-level normalization within REL TO blocks.
2273///
2274/// Scans `text` for each `REL TO ` substring and processes the
2275/// comma-separated entries that follow until the next `//` (or end of
2276/// text). Two patterns apply per entry pair:
2277///
2278/// - **Token-boundary** — within a single entry, `<single-upper> <two-upper>`
2279/// is replaced with the joined 3-letter trigraph when the join is a
2280/// known trigraph and the 1-letter prefix alone is not.
2281///
2282/// - **Comma misplacement** — across an entry pair,
2283/// `<2-upper>,<1-upper><space>...` (entry N ends with two letters,
2284/// entry N+1 starts with one letter followed by a space and then
2285/// content) is replaced with `<3-upper joined>,` and the leading
2286/// character is stripped from entry N+1, when the join is a known
2287/// trigraph and the 2-letter prefix alone is not. The space guard
2288/// (the 1-upper must be followed by ASCII space) is what
2289/// distinguishes the misplacement shape from a legitimate
2290/// shorter-than-3 entry typo and is enforced by `fix_rel_to_block`.
2291///
2292/// Both patterns require the corrected output to be a known trigraph
2293/// (`CapcoTokenSet::is_trigraph`). The trigraph dictionary is the
2294/// arbiter of "valid country code" — no fuzzy guessing.
2295fn try_rel_to_entry_normalize(text: &str) -> Option<String> {
2296 // Cheap pre-check: entry-level patterns 3 and 4 only fire inside a
2297 // `REL TO ` block, so `apply_rel_to_entry_pass` cannot match
2298 // without that anchor. Skip the `to_owned()` allocation entirely
2299 // when the input has no `REL TO ` substring (the common path for
2300 // canonical inputs and for non-REL-TO segments of the broader
2301 // structural-repair caller).
2302 if !text.contains("REL TO ") {
2303 return None;
2304 }
2305
2306 let token_set = CapcoTokenSet;
2307 let mut any_change = false;
2308 let mut current: Option<String> = None;
2309
2310 // Loop until no further fix fires. Most inputs converge in one
2311 // pass; the loop guards against the rare case where fixing one
2312 // pattern exposes another (e.g., a comma misplacement that ends a
2313 // block adjacent to a token-boundary pattern in the next entry).
2314 // First iteration borrows `text`; subsequent iterations re-pass the
2315 // previously rewritten `String` so the only allocation is the one
2316 // produced by the first successful fix (and any further passes).
2317 loop {
2318 let input: &str = current.as_deref().unwrap_or(text);
2319 match apply_rel_to_entry_pass(input, &token_set) {
2320 Some(rewritten) => {
2321 current = Some(rewritten);
2322 any_change = true;
2323 }
2324 None => break,
2325 }
2326 }
2327
2328 if any_change { current } else { None }
2329}
2330
2331/// Single pass of REL TO entry normalization. Returns the rewritten
2332/// text on first fix, or `None` if no pattern matched.
2333fn apply_rel_to_entry_pass(text: &str, token_set: &CapcoTokenSet) -> Option<String> {
2334 let mut search_start = 0;
2335 while let Some(rel_pos) = text[search_start..].find("REL TO ") {
2336 let header_end = search_start + rel_pos + "REL TO ".len();
2337 // Block ends at the next `//` (start of next category) or end
2338 // of text. The `//` boundary is always 2 bytes; we exclude it
2339 // from the block contents.
2340 let block_end = text[header_end..]
2341 .find("//")
2342 .map(|p| header_end + p)
2343 .unwrap_or(text.len());
2344 let block = &text[header_end..block_end];
2345
2346 if let Some((rel_local_offset, fixed_block)) = fix_rel_to_block(block, token_set) {
2347 let mut result = String::with_capacity(text.len());
2348 result.push_str(&text[..header_end]);
2349 result.push_str(&fixed_block);
2350 result.push_str(&text[block_end..]);
2351 // Suppress unused-variable warning when the helper returns
2352 // a fix — `rel_local_offset` is reserved for a future
2353 // localized-emit optimization but not needed today since
2354 // we rebuild the full text.
2355 let _ = rel_local_offset;
2356 return Some(result);
2357 }
2358
2359 search_start = block_end;
2360 }
2361 None
2362}
2363
2364/// Walk the comma-separated entries of one REL TO block; apply
2365/// pattern 3 (token-boundary inside an entry) and pattern 4 (comma
2366/// misplaced between adjacent entries) on first match. Returns
2367/// `(local_offset, rewritten_block)` for the first fix, or `None` if
2368/// the block is already canonical.
2369///
2370/// `local_offset` is the byte offset within `block` where the fix
2371/// landed; reserved for future localized emit optimizations.
2372fn fix_rel_to_block(block: &str, token_set: &CapcoTokenSet) -> Option<(usize, String)> {
2373 // Collect entries with their byte offsets within the block so a
2374 // fix can be emitted with precise positioning.
2375 let mut entries: Vec<(usize, &str)> = Vec::new();
2376 let mut cursor = 0;
2377 for entry in block.split(',') {
2378 entries.push((cursor, entry));
2379 cursor += entry.len() + 1; // +1 for the comma separator
2380 }
2381
2382 // Pattern 3: token-boundary inside a single entry.
2383 // `<lead-ws><single-upper> <two-upper><trail-ws>` → joined trigraph.
2384 for (entry_offset, entry) in &entries {
2385 let trimmed = entry.trim();
2386 // Need exactly 4 chars: `A US` shape. Anything else (3, 5, etc.)
2387 // is either canonical or a different recovery shape.
2388 if trimmed.len() != 4 {
2389 continue;
2390 }
2391 let bytes = trimmed.as_bytes();
2392 if !bytes[0].is_ascii_uppercase()
2393 || bytes[1] != b' '
2394 || !bytes[2].is_ascii_uppercase()
2395 || !bytes[3].is_ascii_uppercase()
2396 {
2397 continue;
2398 }
2399 let joined = format!(
2400 "{}{}{}",
2401 bytes[0] as char, bytes[2] as char, bytes[3] as char
2402 );
2403 if !token_set.is_trigraph(&joined) {
2404 continue;
2405 }
2406 // Defensive: don't fire if the 1-letter prefix is itself a
2407 // trigraph (no real CAPCO trigraph is 1-letter, but guard
2408 // anyway against future schema changes).
2409 let one_letter = std::str::from_utf8(&bytes[..1]).expect("ASCII upper");
2410 if token_set.is_trigraph(one_letter) {
2411 continue;
2412 }
2413
2414 // Rebuild the block: replace the 4-char entry contents with
2415 // the 3-char joined trigraph, preserving any leading/trailing
2416 // whitespace inside the entry.
2417 // entry = lead_ws + trimmed + trail_ws; replace `trimmed`
2418 // (4 chars) with `joined` (3 chars), preserving the
2419 // surrounding whitespace verbatim.
2420 let lead_ws_len = entry.len() - entry.trim_start().len();
2421 let mut rewritten_entry = String::with_capacity(entry.len() - 1);
2422 rewritten_entry.push_str(&entry[..lead_ws_len]);
2423 rewritten_entry.push_str(&joined);
2424 rewritten_entry.push_str(&entry[lead_ws_len + trimmed.len()..]);
2425
2426 let mut result = String::with_capacity(block.len());
2427 result.push_str(&block[..*entry_offset]);
2428 result.push_str(&rewritten_entry);
2429 result.push_str(&block[*entry_offset + entry.len()..]);
2430 return Some((*entry_offset, result));
2431 }
2432
2433 // Pattern 4: comma misplaced between entries.
2434 // entries[i] = `<2-upper>` (trimmed) AND
2435 // entries[i+1] = `<1-upper><space><rest>` (trimmed) AND
2436 // joined 3-letter is a trigraph AND 2-letter alone is not.
2437 for i in 0..entries.len().saturating_sub(1) {
2438 let (left_off, left_entry) = &entries[i];
2439 let (right_off, right_entry) = &entries[i + 1];
2440 let left_trim = left_entry.trim();
2441 let right_trim_start = right_entry.trim_start();
2442 if left_trim.len() != 2 || !left_trim.chars().all(|c| c.is_ascii_uppercase()) {
2443 continue;
2444 }
2445 let right_bytes = right_trim_start.as_bytes();
2446 if right_bytes.len() < 2 || !right_bytes[0].is_ascii_uppercase() || right_bytes[1] != b' ' {
2447 continue;
2448 }
2449 let joined = format!("{}{}", left_trim, right_bytes[0] as char);
2450 if !token_set.is_trigraph(&joined) {
2451 continue;
2452 }
2453 if token_set.is_trigraph(left_trim) {
2454 // 2-letter alone is already a trigraph (e.g., EU); the
2455 // comma might be intentional. Skip.
2456 continue;
2457 }
2458
2459 // Rebuild: left entry becomes `<lead-ws><joined>`, right
2460 // entry becomes ` <rest-after-first-char-and-space>` (we
2461 // strip the first char and the space, prepend a single
2462 // canonical space).
2463 let left_lead = left_entry.len() - left_entry.trim_start().len();
2464 let mut new_left = String::with_capacity(left_entry.len() + 1);
2465 new_left.push_str(&left_entry[..left_lead]);
2466 new_left.push_str(&joined);
2467
2468 let right_lead = right_entry.len() - right_trim_start.len();
2469 // Skip the first char and the following space.
2470 let after_first = &right_trim_start[2..];
2471 let mut new_right = String::with_capacity(right_entry.len());
2472 new_right.push_str(&right_entry[..right_lead]);
2473 new_right.push(' ');
2474 new_right.push_str(after_first);
2475
2476 // Emit: block[..left_off] + new_left + ',' + new_right + block[right_off+right_entry.len()..]
2477 let mut result = String::with_capacity(block.len() + 1);
2478 result.push_str(&block[..*left_off]);
2479 result.push_str(&new_left);
2480 result.push(',');
2481 result.push_str(&new_right);
2482 result.push_str(&block[*right_off + right_entry.len()..]);
2483 return Some((*left_off, result));
2484 }
2485
2486 None
2487}
2488
2489// ---------------------------------------------------------------------------
2490// REL TO trigraph fuzzy expansion (issue #233)
2491// ---------------------------------------------------------------------------
2492
2493/// Emit one canonical-byte alternate per fuzzy candidate for each
2494/// unknown 3- or 4-char REL TO entry.
2495///
2496/// The standard fuzzy path in [`fuzzy_correct_tokens`] operates against
2497/// the [`CapcoTokenSet::correction_vocab`] slice, which deliberately
2498/// excludes country trigraphs (the design comment on `ALL_CVE_TOKENS`
2499/// in `crates/ism/build.rs` calls this out — country codes live
2500/// exclusively in [`marque_ism::TRIGRAPHS`] and are reached through
2501/// [`CapcoTokenSet::is_trigraph`]). So a typo'd 3-char REL TO entry
2502/// like `USB` gets no correction from the standard pass — there's
2503/// nothing in the vocab to match it against. The strict parser then
2504/// emits a `TokenKind::Unknown` for the entry (issue #233 change in
2505/// `parse_rel_to_with_spans`), and the dispatcher's step 3a rejects
2506/// the "drop USB" candidate.
2507///
2508/// With the original candidate filtered out, this function provides
2509/// the alternates the dispatcher chooses between: it walks each
2510/// `REL TO ` block in `text`, finds 3- or 4-char comma-separated
2511/// entries that aren't already valid trigraphs/tetragraphs, asks the
2512/// trigraph-vocab matcher for all candidates within the edit-distance
2513/// bound, and emits one alternate text per candidate (with the
2514/// substitution applied in-place).
2515///
2516/// Each emitted alternate carries an `EditDistance1` /
2517/// `EditDistance2` feature (paired with the candidate's distance) so
2518/// the audit trail records the fuzzy work. The caller pushes a
2519/// `BaseRateCommonMarking` feature acknowledging the trigraph-prior
2520/// contribution. The decoder's `score_candidate` later sums the
2521/// trigraph-prior contribution over the parsed `rel_to` slice; the
2522/// popular-vs-rare log-prior delta (e.g., `log_prior(USA) -
2523/// log_prior(UZB)` ≈ +7 nats) decides which alternate wins the
2524/// `UNAMBIGUOUS_LOG_MARGIN` (~1.6 nat) contest.
2525///
2526/// **Scope**: 3-char (trigraph) and 4-char (tetragraph) ASCII
2527/// uppercase entries only. Two-letter entries (`EU`) are below
2528/// `MIN_FUZZY_LEN`; longer multi-char entries (`AUSTRALIA_GROUP`)
2529/// have low fuzzy-tie risk because their lengths rarely collide.
2530/// Only fires when the entry token is NOT already a valid
2531/// trigraph/tetragraph — so `AUT`, `UZB`, `FVEY`, `ACGU`, `ISAF`
2532/// in legitimate use pass through unchanged. 4-char scope added to
2533/// recover coalition-shorthand typos (`FVYE` → `FVEY`,
2534/// `SGAF` → `ISAF`); issue #246.
2535///
2536/// **CAPCO authority**: REL TO syntax is defined in CAPCO-2016 §H.8.
2537/// The trigraph/tetragraph dictionary itself comes from the ODNI CVE
2538/// schema in `CVEnumISMCATRelTo.xsd`, baked into
2539/// [`CapcoTokenSet::is_trigraph`] and into the
2540/// [`marque_ism::TRIGRAPHS`] slice this function fuzzy-matches against.
2541fn try_rel_to_fuzzy_trigraph_candidates(
2542 text: &str,
2543 trigraph_matcher: &FuzzyVocabMatcher<'_>,
2544) -> Vec<(String, FeatureEntry)> {
2545 let token_set = CapcoTokenSet;
2546 let mut out: Vec<(String, FeatureEntry)> = Vec::new();
2547
2548 let mut search_start = 0;
2549 while let Some(rel_pos) = text[search_start..].find("REL TO ") {
2550 let header_end = search_start + rel_pos + "REL TO ".len();
2551 // Block ends at the EARLIEST of: `//` (next category), `\n`
2552 // (banner/CAB candidates from `Scanner::scan_banners` arrive
2553 // as full lines, so a REL TO line can have trailing prose
2554 // beyond the marking), or `)` (portion-form close). CAPCO
2555 // §H.8 / §A authority: `//` is the category separator; `,`
2556 // separates entries within the REL TO category itself.
2557 // Mirrors the corpus analyzer's terminator priority in
2558 // `tools/corpus-analysis/analyze.py` (`_extract_rel_to_trigraphs`).
2559 let tail = &text[header_end..];
2560 let block_len = ["//", "\n", ")"]
2561 .iter()
2562 .filter_map(|sep| tail.find(sep))
2563 .min()
2564 .unwrap_or(tail.len());
2565 let block_end = header_end + block_len;
2566 let block = &text[header_end..block_end];
2567
2568 // Walk the comma-separated entries with their byte offsets.
2569 let mut cursor = 0usize;
2570 for entry in block.split(',') {
2571 let entry_start = cursor;
2572 let entry_end = cursor + entry.len();
2573 cursor = entry_end + 1; // skip the comma
2574
2575 let trimmed = entry.trim();
2576 // 3-char (trigraph) or 4-char (tetragraph) ASCII-uppercase
2577 // entries only — see fn doc for scope rationale.
2578 let tlen = trimmed.len();
2579 if (tlen != 3 && tlen != 4) || !trimmed.bytes().all(|b| b.is_ascii_uppercase()) {
2580 continue;
2581 }
2582 // Skip already-valid trigraphs/tetragraphs (the matcher's
2583 // binary search would also short-circuit on a vocab hit, but
2584 // keeping the explicit check means a token like `FVEY`
2585 // appearing legitimately never gets multi-cast).
2586 if token_set.is_trigraph(trimmed) {
2587 continue;
2588 }
2589
2590 // Bypass the standard `MIN_USEFUL_CONFIDENCE` floor:
2591 // for a 3-char input, distance-2 corrections sit at
2592 // confidence 0.40, below the default 0.45 cutoff that
2593 // protects the standalone fuzzy path. Issue #233's score-
2594 // time tiebreak (corpus-weighted trigraph priors +
2595 // `UNAMBIGUOUS_LOG_MARGIN`) supplies the safety the
2596 // confidence-floor was substituting for; without lowering
2597 // it here, a typo like `ASU → AUS` (plain Levenshtein
2598 // distance 2) never reaches the scorer.
2599 let mut candidates = trigraph_matcher.correct_all_with_floor(trimmed, 0.0);
2600 if candidates.is_empty() {
2601 continue;
2602 }
2603
2604 // Drop candidates that would duplicate a trigraph already
2605 // present elsewhere in this REL TO block. CAPCO-2016 §H.8
2606 // does not state "no duplicates" as an explicit textual
2607 // prohibition — the REL TO grammar (§A.6 / §H.8 p131-150)
2608 // describes a list of country codes ordered USA-first then
2609 // ascending alphabetic, which structurally implies a set of
2610 // distinct codes but does not forbid repetition in so many
2611 // words. The reason we drop duplicates here is mechanical,
2612 // not citational: the bag-of-tokens scorer happens to
2613 // *reward* duplicates (each instance adds its log-prior
2614 // again), so without this filter an ambiguous typo
2615 // adjacent to a popular trigraph could collapse to
2616 // "REL TO USA, USA, GBR" because USA's log-prior
2617 // contribution is additive. Emitting a duplicate-creating
2618 // candidate would therefore be structurally redundant and
2619 // cause the scorer to erroneously favor it. The block's
2620 // other entries are computed by re-walking
2621 // `block.split(',')` and taking the trigraph form of any
2622 // 3-char ASCII-uppercase entry that's in the CVE
2623 // recognition set.
2624 let other_trigraphs: Vec<&str> = block
2625 .split(',')
2626 .map(str::trim)
2627 .filter(|e| {
2628 let elen = e.len();
2629 (elen == 3 || elen == 4)
2630 && e.bytes().all(|b| b.is_ascii_uppercase())
2631 && *e != trimmed
2632 && token_set.is_trigraph(e)
2633 })
2634 .collect();
2635 candidates.retain(|c| !other_trigraphs.contains(&c.token));
2636 if candidates.is_empty() {
2637 continue;
2638 }
2639
2640 // Rank candidates by (distance, then country-code
2641 // log-prior). The plain Levenshtein hits for a 3-char
2642 // input often produce 20+ distance-2 candidates (every
2643 // other 3-char trigraph that shares one letter). Without
2644 // a prior-rank pre-filter, the K=16 attempt cap upstream
2645 // gets exhausted by low-prior alternates and the
2646 // high-prior ones get dropped. Sorting by (distance asc,
2647 // log-prior desc) keeps the most plausible candidates
2648 // first; we cap at TRIGRAPH_FUZZY_TOP_K per ambiguous
2649 // entry to bound the candidate-set growth.
2650 //
2651 // The cap value (4) is sized so a single ambiguous entry
2652 // doesn't crowd out the other decoder paths
2653 // (`fuzzy_corrected`, reorder, delimiter-insert, etc.):
2654 // 4 alternates ≤ K_MAX_CANDIDATES (8) leaves room for
2655 // the standard candidates the dispatcher also needs.
2656 const TRIGRAPH_FUZZY_TOP_K: usize = 4;
2657 candidates.sort_by(|a, b| {
2658 a.distance.cmp(&b.distance).then_with(|| {
2659 let pa = marque_capco::priors::country_code_log_prior(a.token)
2660 .unwrap_or(f32::NEG_INFINITY);
2661 let pb = marque_capco::priors::country_code_log_prior(b.token)
2662 .unwrap_or(f32::NEG_INFINITY);
2663 pb.total_cmp(&pa)
2664 })
2665 });
2666 candidates.truncate(TRIGRAPH_FUZZY_TOP_K);
2667
2668 for cand in &candidates {
2669 // Reconstruct the full `text` with the entry replaced.
2670 // The 3-char trimmed sub-slice within the entry
2671 // preserves any surrounding whitespace.
2672 let lead_ws_len = entry.len() - entry.trim_start().len();
2673 let trail_ws_len = entry.len() - entry.trim_end().len();
2674 let mut rewritten_entry = String::with_capacity(entry.len());
2675 rewritten_entry.push_str(&entry[..lead_ws_len]);
2676 rewritten_entry.push_str(cand.token);
2677 rewritten_entry.push_str(&entry[entry.len() - trail_ws_len..]);
2678
2679 let mut alt = String::with_capacity(text.len());
2680 alt.push_str(&text[..header_end + entry_start]);
2681 alt.push_str(&rewritten_entry);
2682 alt.push_str(&text[header_end + entry_end..]);
2683
2684 // `FeatureId` is a closed audit-schema enum (see
2685 // `crates/rules/src/confidence.rs` and `MARQUE_AUDIT_SCHEMA`);
2686 // pair each (id, delta) directly off `cand.distance`
2687 // so the match is total over the only two outcomes
2688 // `cand.distance` can take here. The standalone fuzzy
2689 // matcher caps results at `MAX_EDIT_DISTANCE = 2`.
2690 let entry = if cand.distance <= 1 {
2691 FeatureEntry {
2692 id: FeatureId::EditDistance1,
2693 delta: -0.5,
2694 }
2695 } else {
2696 FeatureEntry {
2697 id: FeatureId::EditDistance2,
2698 delta: -1.2,
2699 }
2700 };
2701 out.push((alt, entry));
2702 }
2703 }
2704
2705 search_start = block_end;
2706 }
2707
2708 out
2709}
2710
2711// ---------------------------------------------------------------------------
2712// REL TO USA-injection for short first entries (issue #234 PR-B)
2713// ---------------------------------------------------------------------------
2714
2715/// Emit one canonical-byte alternate per REL TO block whose first
2716/// entry is a 1- or 2-character ASCII-uppercase token AND USA is not
2717/// otherwise present in the block. The alternate replaces that short
2718/// first entry with `USA`.
2719///
2720/// **Why complement to PR-A.** Issue #233's
2721/// [`try_rel_to_fuzzy_trigraph_candidates`] handles 3-char REL TO
2722/// entries: an unknown trigraph-shaped token gets fuzzy-matched
2723/// against the [`marque_ism::TRIGRAPHS`] vocabulary, and corpus-
2724/// weighted log-priors break ties at score time. That path
2725/// deliberately skips entries below `MIN_FUZZY_LEN = 3` (see the
2726/// `if trimmed.len() != 3` guard in `try_rel_to_fuzzy_trigraph_candidates`)
2727/// because `phf`-style fuzzy matching is unreliable on inputs that
2728/// short — a 2-char input is edit-distance-1 from many distinct
2729/// trigraphs and the mapper has no signal to break the tie.
2730///
2731/// For REL TO specifically, the §H.8 p150–151 grammar gives us a
2732/// stronger signal that fuzzy-matching cannot exploit: **USA must
2733/// always appear first**. So when we see a REL TO block whose first
2734/// entry is a 1- or 2-character ASCII-uppercase token, the most
2735/// likely intent — far above any other 3-char trigraph — is that
2736/// the user typed USA and dropped one or two characters. The fixture
2737/// at `tests/fixtures/mangled/typo/ad2bcfe3ac0b0765.json`
2738/// (`REL TO SA, AUS, GBR` → `REL TO USA, AUS, GBR`) is the canonical
2739/// case: `SA` is shape-incompatible with PR-A's 3-char floor, so
2740/// without this complementary path the decoder produces zero
2741/// candidates and the fixture fails recovery.
2742///
2743/// **CAPCO authority**: the USA-first invariant is CAPCO-2016 §H.8
2744/// p151: "After 'USA', list the required one or more trigraph country
2745/// codes in alphabetical order." E020 enforces that invariant at the
2746/// rule layer (via the `marque-capco`-private `canonicalize_trigraph_list`
2747/// helper). This decoder path operates one stage earlier — pre-strict-
2748/// parse, on raw text — so it does NOT call the rule-layer helper; it
2749/// emits a candidate text and lets the downstream pipeline (strict
2750/// parse + E020) verify and re-canonicalize as needed.
2751///
2752/// **Scope and guards** (mirrors PR-A's design):
2753///
2754/// - Fires only when the first entry's trimmed length is 1 or 2 ASCII
2755/// uppercase bytes (3-char entries belong to PR-A's domain).
2756/// - Skips when USA is already present elsewhere in the block — that
2757/// case isn't a USA-typo, it's an unrelated short prefix the user
2758/// may have meant differently. The block stays as-is.
2759/// - Skips when the block has fewer than two entries — a single
2760/// short entry plus nothing else doesn't fit the §H.8 p151
2761/// "USA + trigraph list" shape.
2762/// - Emits the substitution transform only — full canonicalization
2763/// (USA first, remaining trigraphs alphabetical, no duplicates) is
2764/// downstream. If the original list's tail (other than the
2765/// corrupted first entry) wasn't already alphabetical, E020 will
2766/// fire on the post-decode text and produce its own fix; if the
2767/// injection produced a duplicate (USA was already present in the
2768/// block under a different shape), the `already_has_usa` guard
2769/// above suppresses emit. Keeping the decoder text-level (no
2770/// `marque-capco` imports) avoids re-entering the rule layer
2771/// mid-recognition while preserving the single-source-of-truth
2772/// property — the canonical ordering rule lives in `marque-capco`,
2773/// and the decoder defers to whatever it produces post-parse.
2774/// - Audit signal: each candidate carries
2775/// [`FeatureId::BaseRateCommonMarking`] as provenance only, with
2776/// zero delta. This records that USA is the dominant trigraph in
2777/// the corpus prior without changing score or double-counting that
2778/// prior in the posterior. Reusing `BaseRateCommonMarking` (vs
2779/// introducing a new variant) keeps the audit schema closed —
2780/// `MARQUE_AUDIT_SCHEMA` stays at `marque-mvp-2`.
2781fn try_rel_to_usa_injection_candidates(text: &str) -> Vec<(String, FeatureEntry)> {
2782 let mut out: Vec<(String, FeatureEntry)> = Vec::new();
2783
2784 let mut search_start = 0;
2785 while let Some(rel_pos) = text[search_start..].find("REL TO ") {
2786 let header_end = search_start + rel_pos + "REL TO ".len();
2787 // Block ends at the EARLIEST of: `//` (next category), `\n`
2788 // (banner/CAB candidates from `Scanner::scan_banners` arrive
2789 // as full lines), or `)` (portion-form close). CAPCO §H.8 /
2790 // §A authority: `//` is the category separator; `,` separates
2791 // entries within the REL TO category itself. Mirrors the
2792 // terminator priority in `try_rel_to_fuzzy_trigraph_candidates`
2793 // and the corpus analyzer's `_extract_rel_to_trigraphs`.
2794 let tail = &text[header_end..];
2795 let block_len = ["//", "\n", ")"]
2796 .iter()
2797 .filter_map(|sep| tail.find(sep))
2798 .min()
2799 .unwrap_or(tail.len());
2800 let block_end = header_end + block_len;
2801 let block = &text[header_end..block_end];
2802
2803 // Walk entries with their byte offsets within the block.
2804 // Pre-size from comma count + 1 — typical REL TO blocks have
2805 // 2–6 entries, so this avoids reallocations on the common case.
2806 let entries: Vec<(usize, &str)> = {
2807 let mut v = Vec::with_capacity(block.bytes().filter(|&b| b == b',').count() + 1);
2808 let mut cursor = 0usize;
2809 for entry in block.split(',') {
2810 v.push((cursor, entry));
2811 cursor += entry.len() + 1; // +1 for the comma separator
2812 }
2813 v
2814 };
2815 if entries.len() < 2 {
2816 // Single-entry block: doesn't match the §H.8 p151
2817 // "USA + trigraph list" shape we're recovering.
2818 search_start = block_end;
2819 continue;
2820 }
2821
2822 // First entry is the candidate USA-typo position. The
2823 // structural guard is shape-only — len ∈ {1, 2}, all ASCII
2824 // uppercase. 3-char entries fall through to PR-A. Length 0
2825 // (e.g., a leading comma) is already filtered.
2826 let (first_entry_offset, first_entry) = entries[0];
2827 let trimmed = first_entry.trim();
2828 let is_short =
2829 (1..=2).contains(&trimmed.len()) && trimmed.bytes().all(|b| b.is_ascii_uppercase());
2830 if !is_short {
2831 search_start = block_end;
2832 continue;
2833 }
2834
2835 // Skip if USA is already present elsewhere in the block —
2836 // a USA-injection candidate would create a duplicate, which
2837 // E052 (issue #234 PR-B) would then need to dedup. Short-
2838 // circuit here rather than emit-and-redup.
2839 let already_has_usa = entries.iter().skip(1).any(|(_, e)| e.trim() == "USA");
2840 if already_has_usa {
2841 search_start = block_end;
2842 continue;
2843 }
2844
2845 // Build the substituted text. Preserve the entry's
2846 // surrounding whitespace (lead/trail) so the splice
2847 // round-trips through the strict parser the same way the
2848 // original would have.
2849 let lead_ws_len = first_entry.len() - first_entry.trim_start().len();
2850 let trail_ws_len = first_entry.len() - first_entry.trim_end().len();
2851 let mut rewritten_entry = String::with_capacity(first_entry.len() + 3);
2852 rewritten_entry.push_str(&first_entry[..lead_ws_len]);
2853 rewritten_entry.push_str("USA");
2854 rewritten_entry.push_str(&first_entry[first_entry.len() - trail_ws_len..]);
2855
2856 let mut alt = String::with_capacity(text.len() + 3);
2857 alt.push_str(&text[..header_end + first_entry_offset]);
2858 alt.push_str(&rewritten_entry);
2859 alt.push_str(&text[header_end + first_entry_offset + first_entry.len()..]);
2860
2861 // Audit-only provenance. The load-bearing scoring lives in
2862 // `score_candidate`, which sums `country_code_log_prior(USA)`
2863 // — already an extreme positive in the baked corpus prior —
2864 // over the parsed `rel_to` slice and is what carries the
2865 // candidate to victory. The `BaseRateCommonMarking` entry
2866 // here records the prior's contribution in the audit log
2867 // without double-counting it in the decoder's score, mirror-
2868 // ing PR-A's trigraph-prior treatment (delta = 0.0).
2869 let entry = FeatureEntry {
2870 id: FeatureId::BaseRateCommonMarking,
2871 delta: 0.0,
2872 };
2873 out.push((alt, entry));
2874
2875 search_start = block_end;
2876 }
2877
2878 out
2879}
2880
2881// ---------------------------------------------------------------------------
2882// SCI delimiter recovery (issue #198 — #133 PR 10)
2883// ---------------------------------------------------------------------------
2884
2885/// SCI delimiter recovery preprocessing — issue #198, #133 PR 10.
2886///
2887/// Repairs three classes of SCI delimiter typos against the closed
2888/// CVE vocabulary in `CVEnumISMSCIControls.xml`. Vocabulary checks
2889/// dispatch through the build-time-generated [`SciControlBare::parse`]
2890/// (bare control systems) and [`SciControl::parse`] (the full CVE set
2891/// including all registered control-compartment compounds), so the
2892/// repair surface tracks ODNI schema updates automatically — no
2893/// hand-maintained vocabulary slice to drift out of sync per
2894/// Constitution IV (Layer 1 generated predicates):
2895///
2896/// - **Pattern A (concatenated compound)**: a token equal to a compound
2897/// with the hyphen removed → canonical hyphenated form. `HCSP →
2898/// HCS-P`, `SIG → SI-G`, `TKKAND → TK-KAND`, etc.
2899/// - **Pattern B (concatenated bare control systems)**: a token of
2900/// length 4–6 that splits cleanly into two bare control systems →
2901/// slash-joined form (`SITK → SI/TK`, `HCSSI → HCS/SI`) per §A.6
2902/// p16 and the `TOP SECRET//ANB/SI/TK/XNB//NOFORN` example on p194.
2903/// Ambiguous splits bail out — see [`repair_sci_token`] for the
2904/// guard.
2905/// - **Pattern C (wrong delimiter)**: a token of the form
2906/// `<bare_cs>-<bare_cs>` that is NOT itself a registered compound →
2907/// slash-joined form. `SI-TK → SI/TK` (because `SI-TK` is not
2908/// registered), but `SI-G` is left alone (it IS registered — `-` is
2909/// the correct control-compartment separator per §A.6 p16).
2910///
2911/// **Out of scope** — sub-compartment fuzzy recovery (`ABCE → ABCD`),
2912/// unregistered-compartment recovery, and any rewrite that would
2913/// require fuzz-correcting against agency-assigned codewords. Those
2914/// require operator-supplied vocab (issue #180) — the engine cannot
2915/// invent identifiers it doesn't know are valid (Constitution VIII).
2916///
2917/// **Architectural shape** mirrors `try_rel_to_structural_repair`
2918/// (PR 9, #190): runs as preprocessing on the input string before
2919/// per-token fuzzy correction, returns `Some(repaired)` only when at
2920/// least one repair fired. The caller pushes a `BaseRateCommonMarking`
2921/// feature onto `delim_features` so every candidate derived from the
2922/// repaired text inherits the audit trace.
2923///
2924/// **Allocation behavior**: short-circuits without allocation when the
2925/// pre-check finds no SCI control system root in the text. The
2926/// per-token walk borrows the input until a fix actually fires.
2927fn try_sci_delimiter_repair(text: &str) -> Option<String> {
2928 if !contains_any_sci_root(text) {
2929 return None;
2930 }
2931
2932 // ASCII-only guard. The SCI control-system vocabulary
2933 // (`SciControlBare::ALL`) and the registered compound names
2934 // (`SciControl::ALL`) are pure ASCII, as are the delimiters this
2935 // function recognizes (`-`, `/`, `(`, `)`, space, tab, newline,
2936 // CR, comma). So any non-ASCII input cannot match any pattern;
2937 // bailing early avoids the byte-vs-char-boundary hazard that
2938 // would otherwise arise from indexing `text` with byte offsets.
2939 if !text.is_ascii() {
2940 return None;
2941 }
2942
2943 let bytes = text.as_bytes();
2944 let mut result: Option<String> = None;
2945 let mut last_copied = 0usize;
2946 let mut i = 0usize;
2947
2948 while i < bytes.len() {
2949 let at_boundary = i == 0
2950 || matches!(
2951 bytes[i - 1],
2952 b'/' | b'(' | b')' | b' ' | b'\t' | b'\n' | b'\r' | b','
2953 );
2954 if !at_boundary {
2955 i += 1;
2956 continue;
2957 }
2958
2959 let token_start = i;
2960 let token_end = bytes[token_start..]
2961 .iter()
2962 .position(|&b| matches!(b, b'/' | b'(' | b')' | b' ' | b'\t' | b'\n' | b'\r' | b','))
2963 .map(|n| token_start + n)
2964 .unwrap_or(bytes.len());
2965
2966 if token_start < token_end {
2967 let token = &text[token_start..token_end];
2968 if let Some(repaired) = repair_sci_token(token) {
2969 let r = result.get_or_insert_with(|| String::with_capacity(text.len()));
2970 r.push_str(&text[last_copied..token_start]);
2971 r.push_str(&repaired);
2972 last_copied = token_end;
2973 }
2974 }
2975
2976 // Advance past the token; the next iteration will re-check the
2977 // boundary before the byte after the delimiter (or terminate at
2978 // end-of-input).
2979 i = token_end + 1;
2980 }
2981
2982 result.map(|mut r| {
2983 r.push_str(&text[last_copied..]);
2984 r
2985 })
2986}
2987
/// Cheap pre-check for [`try_sci_delimiter_repair`]: reports whether
/// `text` contains at least one bare SCI control-system identifier as
/// a plain substring.
///
/// The match is deliberately loose (substring, case-sensitive, no
/// token boundaries). A false positive only costs a token walk that
/// ends in `None`; the point is to skip that walk for the
/// overwhelmingly common input that has no SCI category at all.
///
/// NOTE(review): the root list below is written out by hand. A bare
/// control system missing from it would turn this pre-check into a
/// false negative and suppress repair entirely — confirm it stays in
/// step with the generated `SciControlBare` vocabulary.
fn contains_any_sci_root(text: &str) -> bool {
    const BARE_SCI_ROOTS: [&str; 7] = ["HCS", "KLM", "MVL", "RSV", "BUR", "SI", "TK"];
    BARE_SCI_ROOTS.iter().any(|&root| text.contains(root))
}
3003
3004/// Per-token classifier for SCI delimiter repair. Returns the repaired
3005/// token if one of patterns A/B/C matches; otherwise `None`.
3006///
3007/// All vocabulary checks dispatch through the build-time-generated
3008/// [`SciControlBare::parse`] and [`SciControl::parse`] (from
3009/// `marque-ism`'s generated `values.rs`), so the repair surface tracks
3010/// `CVEnumISMSCIControls.xml` automatically. New CVE compounds added
3011/// in a future ODNI schema bump (e.g., a hypothetical `SI-XYZ`) are
3012/// auto-discovered by Pattern A without any code change here.
3013///
3014/// Pattern dispatch order:
3015/// 1. Pattern A (split into bare-CS prefix + suffix; if
3016/// `{prefix}-{suffix}` is a registered CVE value, return it)
3017/// 2. Pattern C (token contains `-`, neither side is a registered
3018/// compound's compartment, both halves are bare CS)
3019/// 3. Pattern B (no `-`, splits into two bare CS, unambiguous)
3020fn repair_sci_token(token: &str) -> Option<String> {
3021 if token.is_empty() {
3022 return None;
3023 }
3024
3025 // ASCII-only guard. The CVE vocabulary is pure ASCII, so a non-
3026 // ASCII token cannot match any pattern; bailing early ensures
3027 // the byte-offset slicing below (`token[..split]`,
3028 // `token[split..]`, `token[..dash_pos]`, `token[dash_pos + 1..]`)
3029 // never lands in the middle of a multi-byte UTF-8 sequence. This
3030 // is a defense-in-depth check — the only production caller
3031 // (`try_sci_delimiter_repair`) already gates on ASCII — but
3032 // keeping it here makes the function's invariant local and
3033 // self-evident for any future caller (e.g., a unit test).
3034 if !token.is_ascii() {
3035 return None;
3036 }
3037
3038 let len = token.len();
3039
3040 // Pattern A — concatenated registered compound. Walk every split
3041 // where the prefix is a bare control system; if `{prefix}-{suffix}`
3042 // is in the CVE vocabulary, return the canonical hyphenated form.
3043 // Bare CS lengths are 2 or 3; suffix length range comes from CVE
3044 // (max compartment-form suffix is 4 chars, e.g. TK-BLFH).
3045 if !token.contains('-') && (3..=8).contains(&len) {
3046 for &split in &[2usize, 3] {
3047 if split >= len {
3048 continue;
3049 }
3050 let prefix = &token[..split];
3051 let suffix = &token[split..];
3052 if SciControlBare::parse(prefix).is_some() {
3053 let canonical = format!("{prefix}-{suffix}");
3054 if SciControl::parse(&canonical).is_some() {
3055 return Some(canonical);
3056 }
3057 }
3058 }
3059 }
3060
3061 // Pattern C — wrong delimiter (`-` between two bare CS). Skip if
3062 // the whole token is itself a registered CVE compound.
3063 if let Some(dash_pos) = token.find('-') {
3064 if SciControl::parse(token).is_some() {
3065 return None;
3066 }
3067 let prefix = &token[..dash_pos];
3068 let suffix = &token[dash_pos + 1..];
3069 if SciControlBare::parse(prefix).is_some() && SciControlBare::parse(suffix).is_some() {
3070 return Some(format!("{prefix}/{suffix}"));
3071 }
3072 return None;
3073 }
3074
3075 // Pattern B — concatenated bare control systems (no delimiter).
3076 // Bare CS lengths are 2 or 3; the concatenation is therefore in
3077 // [4..=6]. Try splits at positions 2 and 3 (the only split points
3078 // that can yield two valid bare-CS halves) and require an
3079 // unambiguous match.
3080 if !(4..=6).contains(&len) {
3081 return None;
3082 }
3083 let mut found: Option<(&str, &str)> = None;
3084 for &split in &[2usize, 3] {
3085 if split >= len {
3086 continue;
3087 }
3088 let suffix_len = len - split;
3089 if !(2..=3).contains(&suffix_len) {
3090 continue;
3091 }
3092 let prefix = &token[..split];
3093 let suffix = &token[split..];
3094 if SciControlBare::parse(prefix).is_some() && SciControlBare::parse(suffix).is_some() {
3095 if found.is_some() {
3096 return None;
3097 }
3098 found = Some((prefix, suffix));
3099 }
3100 }
3101 found.map(|(p, s)| format!("{p}/{s}"))
3102}
3103
3104// ---------------------------------------------------------------------------
3105// Token reordering
3106// ---------------------------------------------------------------------------
3107
3108/// Try to produce a canonical-order rewrite of `text`.
3109///
3110/// The CAPCO category order is: classification → SCI → SAR → dissem.
3111/// If the observed segments are out of order — e.g., `NOFORN//SECRET`
3112/// with dissem first — this helper swaps them into the canonical
3113/// order. Returns `None` when the input is already in canonical order
3114/// or when reordering doesn't apply (CAB lines, single-segment input).
3115fn try_canonical_reorder(text: &str) -> Option<String> {
3116 // Only banner/portion-shaped input (contains `//`) is reorderable
3117 // with this heuristic. CABs use keyed authority lines, not
3118 // category ordering.
3119 if !text.contains("//") {
3120 return None;
3121 }
3122
3123 // Portion form: `(C//NF)` — strip the surrounding parens for
3124 // reasoning, re-wrap at emit.
3125 let (prefix, body, suffix) = if text.starts_with('(') && text.ends_with(')') {
3126 ("(", &text[1..text.len() - 1], ")")
3127 } else {
3128 ("", text, "")
3129 };
3130
3131 let segments: Vec<&str> = body.split("//").collect();
3132 if segments.len() < 2 {
3133 return None;
3134 }
3135
3136 // Classify each segment by its dominant category. We only
3137 // reorder when exactly one segment is classification-dominant
3138 // and at least one other is dissem-dominant — otherwise the
3139 // input is too ambiguous for a clean swap.
3140 let mut class_segments: Vec<&str> = Vec::new();
3141 let mut dissem_segments: Vec<&str> = Vec::new();
3142 let mut other_segments: Vec<&str> = Vec::new();
3143 for seg in &segments {
3144 let seg = seg.trim();
3145 if seg.is_empty() {
3146 continue;
3147 }
3148 match classify_segment(seg) {
3149 SegmentClass::Classification => class_segments.push(seg),
3150 SegmentClass::Dissem => dissem_segments.push(seg),
3151 SegmentClass::Other => other_segments.push(seg),
3152 }
3153 }
3154
3155 if class_segments.is_empty() {
3156 return None;
3157 }
3158
3159 // Detect non-US markings: any classification segment is a NATO,
3160 // JOINT, or FGI classification (not a US classification level).
3161 let is_non_us = class_segments
3162 .iter()
3163 .any(|s| is_non_us_classification_segment(s));
3164
3165 // Already-canonical check: if the classification segment is the
3166 // first non-empty segment, no reorder is needed.
3167 // For non-US markings: also require that the body already starts
3168 // with `//` (the empty US classification slot). If the class is
3169 // first but the `//` prefix is absent, fall through to add it.
3170 if let Some(first) = segments.iter().find(|s| !s.trim().is_empty()) {
3171 if class_segments.contains(&first.trim()) {
3172 // US: already canonical.
3173 // Non-US: already canonical only when // prefix is present.
3174 if !is_non_us || body.starts_with("//") {
3175 return None;
3176 }
3177 }
3178 }
3179
3180 // Emit: classification → other (SCI/SAR/FGI blocks) → dissem.
3181 let mut ordered: Vec<&str> = Vec::new();
3182 ordered.extend(class_segments);
3183 ordered.extend(other_segments);
3184 ordered.extend(dissem_segments);
3185
3186 let joined = ordered.join("//");
3187
3188 // Non-US canonical form: `//{class}//{others}//{dissems}`. The
3189 // leading `//` represents the empty US classification slot (per
3190 // CAPCO-2016 §A.6) and signals the strict parser to use the
3191 // non-US classification code path.
3192 if is_non_us {
3193 Some(format!("{prefix}//{joined}{suffix}"))
3194 } else {
3195 Some(format!("{prefix}{joined}{suffix}"))
3196 }
3197}
3198
/// The CAPCO category a `//`-separated segment primarily belongs to,
/// as decided by `classify_segment` from the segment's leading
/// token(s).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SegmentClass {
    /// The segment opens with a known classification level (`U`, `C`,
    /// `S`, `TS`, `CONFIDENTIAL`, …).
    Classification,
    /// The segment opens with a known dissemination control
    /// (`NOFORN`, `NF`, `ORCON`, …).
    Dissem,
    /// Everything else: SCI/SAR/FGI sub-blocks and any other segment
    /// that is neither classification- nor dissem-dominant.
    Other,
}
3212
3213fn classify_segment(seg: &str) -> SegmentClass {
3214 let first_token = seg.split_whitespace().next().unwrap_or("");
3215 // Strip trailing commas.
3216 let first_token = first_token.trim_end_matches(',');
3217 // Single-whitespace-token classifications only. `TOP SECRET` and
3218 // multi-word NATO/JOINT forms are handled by the separate
3219 // starts_with branches below.
3220 const CLASSIFICATIONS: &[&str] = &[
3221 "U",
3222 "R",
3223 "C",
3224 "S",
3225 "TS",
3226 "UNCLASSIFIED",
3227 "RESTRICTED",
3228 "CONFIDENTIAL",
3229 "SECRET",
3230 // NATO classification abbreviations (single-token forms).
3231 "NS",
3232 "NC",
3233 "NU",
3234 "CTS",
3235 "CTSA",
3236 "NSAT",
3237 "NCA",
3238 "CTS-B",
3239 "CTS-BALK",
3240 // JOINT classification indicator.
3241 "JOINT",
3242 ];
3243 // Dissemination-control tokens — IC (§H.8) and non-IC (§H.9).
3244 // SCI controls (HCS, SI, TK, and all their sub-compartment forms)
3245 // are NOT in this list — they belong to their own category under
3246 // CAPCO §A.6 and the canonical order places them between
3247 // classification and dissem. Classifying an HCS segment as Dissem
3248 // would drive `try_canonical_reorder` to move it past the dissem
3249 // block, corrupting the rewrite. SCI segments therefore fall
3250 // through to `SegmentClass::Other`, which the reorder helper
3251 // inserts between classification and dissem — the right spot per
3252 // CAPCO-2016 §A.6.
3253 //
3254 // AEA controls (RD, FRD, TFNI, CNWDI, SIGMA) are also omitted —
3255 // they appear between SCI and dissem per §A.6. A pre-check above
3256 // `CLASSIFICATIONS.contains` prevents "RESTRICTED DATA" from being
3257 // mistaken for the NATO RESTRICTED classification.
3258 //
3259 // "REL" is the first token of "REL TO {country-list}" segments.
3260 //
3261 // Non-IC dissem controls (§H.9): portion marks (DS, XD, ND,
3262 // SBU, SBU-NF, LES, LES-NF, SSI) and banner abbreviations
3263 // (LIMDIS, EXDIS, NODIS) are included so reordering places them
3264 // in the dissem block, not the SCI/AEA block (CAPCO-2016 §A.6).
3265 const DISSEMS: &[&str] = &[
3266 // §H.8 IC dissemination controls
3267 "NOFORN", "NF", "ORCON", "OC", "PROPIN", "PR", "IMCON", "IMC", "RELIDO", "RS", "RSEN",
3268 "DSEN", "FISA", "FOUO", "EYES", "REL",
3269 // §H.9 non-IC dissemination controls — portion marks
3270 "DS", "XD", "ND", "SBU", "SBU-NF", "LES", "LES-NF", "SSI",
3271 // §H.9 non-IC dissemination controls — banner abbreviations
3272 "LIMDIS", "EXDIS", "NODIS",
3273 ];
3274 // Pre-check: "RESTRICTED DATA" (AEA marking, §H.6) must not be
3275 // mistaken for the NATO RESTRICTED classification even though
3276 // "RESTRICTED" appears in CLASSIFICATIONS. The bare token
3277 // "RESTRICTED" IS valid as NATO classification; "RESTRICTED DATA"
3278 // and longer AEA forms are not. CAPCO-2016 §H.6 p113.
3279 if first_token == "RESTRICTED" && seg.split_whitespace().nth(1).is_some() {
3280 return SegmentClass::Other;
3281 }
3282 if CLASSIFICATIONS.contains(&first_token) {
3283 SegmentClass::Classification
3284 // Single-token dissem controls and multi-word non-IC long-title forms.
3285 // Multi-word forms cannot be single-token-matched because their first words
3286 // ("LIMITED", "NO", "EXCLUSIVE", "LAW", "SENSITIVE") are too ambiguous;
3287 // they are checked via starts_with here. CAPCO-2016 §H.8–9.
3288 } else if DISSEMS.contains(&first_token)
3289 || (first_token == "LIMITED" && seg.starts_with("LIMITED DISTRIBUTION"))
3290 || (first_token == "NO" && seg.starts_with("NO DISTRIBUTION"))
3291 || (first_token == "EXCLUSIVE" && seg.starts_with("EXCLUSIVE DISTRIBUTION"))
3292 || (first_token == "LAW" && seg.starts_with("LAW ENFORCEMENT SENSITIVE"))
3293 || (first_token == "SENSITIVE"
3294 && (seg.starts_with("SENSITIVE BUT UNCLASSIFIED")
3295 || seg.starts_with("SENSITIVE SECURITY INFORMATION")))
3296 {
3297 SegmentClass::Dissem
3298 } else if (first_token == "TOP" && seg.starts_with("TOP SECRET"))
3299 || (first_token == "COSMIC" && seg.starts_with("COSMIC TOP SECRET"))
3300 || (first_token == "NATO"
3301 && (seg.starts_with("NATO SECRET")
3302 || seg.starts_with("NATO CONFIDENTIAL")
3303 || seg.starts_with("NATO UNCLASSIFIED")
3304 || seg.starts_with("NATO RESTRICTED")))
3305 {
3306 SegmentClass::Classification
3307 } else if CapcoTokenSet.is_trigraph(first_token) {
3308 // FGI pattern: {registered country trigraph} {classification level}.
3309 // Validated against the authoritative CVEnumISMCATRelTo vocabulary so
3310 // typos like "OTP" (→ TOP) don't get mistaken for FGI country codes.
3311 let second = seg.split_whitespace().nth(1).unwrap_or("");
3312 let second = second.trim_end_matches(',');
3313 if matches!(
3314 second,
3315 "U" | "R"
3316 | "C"
3317 | "S"
3318 | "TS"
3319 | "UNCLASSIFIED"
3320 | "RESTRICTED"
3321 | "CONFIDENTIAL"
3322 | "SECRET"
3323 ) || (second == "TOP"
3324 && seg
3325 .split_whitespace()
3326 .nth(2)
3327 .is_some_and(|t| t.trim_end_matches(',') == "SECRET"))
3328 {
3329 SegmentClass::Classification
3330 } else {
3331 SegmentClass::Other
3332 }
3333 } else {
3334 SegmentClass::Other
3335 }
3336}
3337
3338/// Returns true when `seg` is a non-US classification segment: a NATO
3339/// classification abbreviation, a JOINT classification phrase, or an FGI
3340/// `{trigraph} {level}` pattern.
3341///
3342/// Used by `try_canonical_reorder` to decide whether the reordered output
3343/// needs a leading `//` (the empty US classification slot that signals the
3344/// strict parser to take the non-US code path).
3345fn is_non_us_classification_segment(seg: &str) -> bool {
3346 const NATO_ABBREVS: &[&str] = &[
3347 "NS", "NC", "NU", "CTS", "CTSA", "NSAT", "NCA", "CTS-B", "CTS-BALK",
3348 ];
3349 let mut tokens = seg.split_whitespace();
3350 let first = tokens.next().unwrap_or("");
3351 let first = first.trim_end_matches(',');
3352 if NATO_ABBREVS.contains(&first) {
3353 return true;
3354 }
3355 if first == "JOINT" {
3356 return true;
3357 }
3358 if first == "COSMIC" && seg.starts_with("COSMIC TOP SECRET") {
3359 return true;
3360 }
3361 if first == "NATO"
3362 && (seg.starts_with("NATO SECRET")
3363 || seg.starts_with("NATO CONFIDENTIAL")
3364 || seg.starts_with("NATO UNCLASSIFIED")
3365 || seg.starts_with("NATO RESTRICTED"))
3366 {
3367 return true;
3368 }
3369 // FGI: {registered country trigraph} {classification level}.
3370 // Validated against the authoritative CVEnumISMCATRelTo vocabulary so
3371 // typos like "OTP" (→ TOP) are not mistaken for FGI country codes.
3372 if CapcoTokenSet.is_trigraph(first) {
3373 let second = tokens.next().unwrap_or("");
3374 let second = second.trim_end_matches(',');
3375 if matches!(
3376 second,
3377 "U" | "R"
3378 | "C"
3379 | "S"
3380 | "TS"
3381 | "UNCLASSIFIED"
3382 | "RESTRICTED"
3383 | "CONFIDENTIAL"
3384 | "SECRET"
3385 ) {
3386 return true;
3387 }
3388 if second == "TOP"
3389 && tokens
3390 .next()
3391 .is_some_and(|t| t.trim_end_matches(',') == "SECRET")
3392 {
3393 return true;
3394 }
3395 }
3396 false
3397}
3398
3399/// Prepends the non-US leading `//` when the entire input (no existing `//`)
3400/// looks like a non-US classification segment.
3401///
3402/// This covers bare non-US markings like `NS`, `JOINT S GBR USA`, or
3403/// `CAN S` that arrive with no delimiter at all — `try_canonical_reorder`
3404/// cannot act on them because it requires at least two `//`-separated
3405/// segments. Emitting `//NS`, `//JOINT S GBR USA`, etc. lets the strict
3406/// parser recognize the non-US code path (CAPCO-2016 §A.6, parser block 1).
3407fn try_add_non_us_prefix(text: &str) -> Option<String> {
3408 // Only act when there is no `//` at all — try_canonical_reorder
3409 // handles the has-// but missing-prefix case.
3410 if text.contains("//") {
3411 return None;
3412 }
3413 let (prefix, body, suffix) = if text.starts_with('(') && text.ends_with(')') {
3414 ("(", &text[1..text.len() - 1], ")")
3415 } else {
3416 ("", text, "")
3417 };
3418 if is_non_us_classification_segment(body.trim()) {
3419 Some(format!("{prefix}//{body}{suffix}"))
3420 } else {
3421 None
3422 }
3423}
3424
3425// ---------------------------------------------------------------------------
3426// FR-011 strict-context floor
3427// ---------------------------------------------------------------------------
3428
3429/// True when `marking`'s classification level is ≥ `floor`.
3430///
3431/// FR-011 invariant. `floor` is the `Classification as u8` encoding
3432/// (Unclassified=0 … TopSecret=4) — see [`ParseContext::classification_floor`].
3433///
3434/// A marking with no classification info cannot clear a non-trivial
3435/// floor — return `false` so the candidate is dropped when the floor
3436/// is CONFIDENTIAL or above.
3437fn meets_classification_floor(marking: &CapcoMarking, floor: u8) -> bool {
3438 let Some(level) = marking_classification(marking) else {
3439 return floor == Classification::Unclassified as u8;
3440 };
3441 (level as u8) >= floor
3442}
3443
3444/// Extract the effective classification level from a parsed marking.
3445///
3446/// Delegates to [`marque_ism::MarkingClassification::effective_level`],
3447/// which handles all variants (`Us`, `Fgi`, `Nato`, `Joint`,
3448/// `Conflict`) by mapping each to the canonical [`Classification`]
3449/// ladder. NATO levels map through
3450/// [`NatoClassification::us_equivalent`](marque_ism::NatoClassification::us_equivalent).
3451fn marking_classification(marking: &CapcoMarking) -> Option<Classification> {
3452 marking
3453 .0
3454 .classification
3455 .as_ref()
3456 .map(|c| c.effective_level())
3457}
3458
3459/// True when the parsed marking carries at least one recognized
3460/// attribute — any classification, SCI / SAR / AEA / FGI / dissem /
3461/// REL-TO entry, or CAB field (Classified By, Derived From,
3462/// Declassify On, declass exemption).
3463///
3464/// Distinct from [`strict_parse_is_complete`]: a marking can be
3465/// nontrivial (has a dissem control) while still being incomplete
3466/// (missing its classification). The dispatcher consults both — a
3467/// strict result is only accepted when it is BOTH nontrivial AND
3468/// complete; otherwise the decoder is invoked to try to recover the
3469/// missing pieces.
3470///
3471/// True when `bytes` is a portion-shaped slice whose inner content
3472/// is exactly one ASCII letter — `(s)`, `(c)`, `(u)`, `(r)`, `(S)`,
3473/// etc. Tolerant of leading whitespace; the strict recognizer
3474/// already accepts a small amount of leading whitespace on portion
3475/// candidates (`StrictRecognizer::recognize` strips it before
3476/// parsing) and the prose-glue heuristic must do the same so the
3477/// caller's `cx.preceded_by_whitespace` flag remains the authoritative
3478/// signal for "is this glued to a word."
3479///
3480/// Used by [`DecoderRecognizer::recognize`] for the prose-glue
3481/// suppression early-out. A 2-letter inner content like `(TS)` is
3482/// outside the heuristic's scope — multi-letter classification
3483/// abbrevs are rare in prose and don't share the plural-suffix
3484/// confusability that drives this filter.
3485fn is_single_letter_portion(bytes: &[u8]) -> bool {
3486 let trimmed = bytes
3487 .iter()
3488 .position(|b| !b.is_ascii_whitespace())
3489 .map(|i| &bytes[i..])
3490 .unwrap_or(bytes);
3491 matches!(trimmed, [b'(', inner, b')'] if inner.is_ascii_alphabetic())
3492}
3493
3494/// Used inside the decoder itself to filter out lenient-parse-
3495/// accepts-anything results (`FROBNITZ//WIBBLE` trip-fires the
3496/// banner scanner and produces a zero-attribute parse); without
3497/// the filter, every `X//Y` prose fragment would materialize a
3498/// fabricated empty marking candidate.
3499fn is_nontrivial_marking(marking: &CapcoMarking) -> bool {
3500 let a = &marking.0;
3501 a.classification.is_some()
3502 || !a.sci_controls.is_empty()
3503 || a.sar_markings.is_some()
3504 || !a.aea_markings.is_empty()
3505 || a.fgi_marker.is_some()
3506 || !a.dissem_controls.is_empty()
3507 || !a.non_ic_dissem.is_empty()
3508 || !a.rel_to.is_empty()
3509 || a.classified_by.is_some()
3510 || a.derived_from.is_some()
3511 || a.declassify_on.is_some()
3512 || a.declass_exemption.is_some()
3513}
3514
3515/// True when the strict-parse result is complete enough that the
3516/// dispatcher should accept it and skip the decoder fallback.
3517///
3518/// The strict parser (`marque_core::Parser`) is lenient about
3519/// content: it categorizes tokens by *position* (the first token
3520/// inside `(...)` is marked as `TokenKind::Classification`
3521/// regardless of whether its text is a valid classification value),
3522/// and falls back to `TokenKind::Unknown` only for truly unplaceable
3523/// tokens. So a shape like `(SERCET//NOFORN)` parses to a marking
3524/// with `classification: None` (SERCET doesn't resolve to any
3525/// `Classification` variant), `dissem_controls: [Nf]` (NOFORN was
3526/// recognized), and a Classification-kind `TokenSpan` carrying the
3527/// literal text "SERCET". That result is *nontrivial* but also
3528/// *incomplete* — exactly the mangled-input case the decoder exists
3529/// to recover.
3530///
3531/// Predicate, kind-aware:
3532///
3533/// - [`MarkingType::Portion`] / [`MarkingType::Banner`]: complete
3534/// iff `classification.is_some()` AND no `TokenKind::Unknown`
3535/// spans survived. Both branches matter — SERCET→None catches
3536/// the classification-slot typo; the `Unknown` check catches
3537/// typos in the tail (e.g., `(S//FRBN)` where the classification
3538/// is fine but FRBN is mangled and lands as Unknown).
3539/// - [`MarkingType::Cab`]: complete iff any CAB field is present
3540/// (`classified_by` / `derived_from` / `declassify_on`).
3541/// CAB-kind input doesn't require a classification axis — an
3542/// isolated authority block stands on its own.
3543/// - Anything else: fall back to the generic nontrivial check.
3544fn strict_parse_is_complete(marking: &CapcoMarking, kind: MarkingType) -> bool {
3545 use marque_ism::TokenKind;
3546 let attrs = &marking.0;
3547 match kind {
3548 MarkingType::Portion | MarkingType::Banner => {
3549 attrs.classification.is_some()
3550 && !attrs
3551 .token_spans
3552 .iter()
3553 .any(|s| matches!(s.kind, TokenKind::Unknown))
3554 }
3555 MarkingType::Cab => {
3556 attrs.classified_by.is_some()
3557 || attrs.derived_from.is_some()
3558 || attrs.declassify_on.is_some()
3559 || attrs.declass_exemption.is_some()
3560 }
3561 _ => is_nontrivial_marking(marking),
3562 }
3563}
3564
3565// ---------------------------------------------------------------------------
3566// Scoring
3567// ---------------------------------------------------------------------------
3568
/// Fallback log-prior for a canonical token that has no entry in the
/// baked `TOKEN_BASE_RATES` table.
///
/// Baked priors are `log((hits + 1) / (total + |V|))`, i.e.
/// Laplace-smoothed over the non-IC Enron corpus (see
/// `tools/corpus-analysis/analyze.py::derive_priors`), so a token the
/// corpus never showed still has a non-zero prior in that build. This
/// constant covers a different, rarer situation: the canonical-tokens
/// iterator yields a string that was not in the build's vocabulary at
/// all (for instance a CVE token added after the priors were last
/// regenerated). Without a floor such a token would add `0.0` to the
/// sum, and because every real log-prior is negative, an unknown
/// token would outscore a known one and invert the ranking.
///
/// The magnitude (`-12.0` nats ≈ log(6e-6)) is picked to sit strictly
/// below every log-prior the generator emits for a non-empty corpus:
/// the Enron-derived values bottom out around `-11.7` for the rarest
/// observed tokens (see `crates/capco/corpus/priors.json`).
const MISSING_TOKEN_LOG_PRIOR: f32 = -12.0_f32;
3590
/// Posterior penalty applied when a candidate's strict parse buries a
/// reserved dissem-control token (a hard splitter — see
/// [`is_hard_splitter`]) inside a SAR or SCI sub-component slot.
///
/// **Why this exists.** Hard-splitter tokens (NOFORN, ORCON, EXDIS,
/// FOUO, …) have hard reserved meanings as dissem controls per
/// CAPCO-2016 §H.8/§H.9; they have no in-segment role inside SCI or
/// SAR sub-components. A strict parse that places such a token under
/// [`marque_ism::SarMarking`] or [`marque_ism::SciMarking`] is
/// essentially always an artifact of a missing `//` in the input —
/// the alternative parse with the token emitted as a dissem control
/// is the correct interpretation. (REL TO is intentionally excluded
/// from the penalty surface here: its payload is a list of country
/// trigraphs whose grammar accepts only 3-letter alpha codes drawn
/// from the CVE-derived trigraph table, so a 4+-char hard splitter
/// cannot land in a REL TO slot in the first place. The Copilot
/// review on PR #178 flagged a wider doc claim that suggested
/// otherwise — the doc is now scoped to the slots the penalty
/// actually defends.)
///
/// **Why scoring needs help.** The bag-of-tokens scorer sums
/// log-priors for the marking's canonical tokens, and
/// `canonical_tokens_for` deliberately excludes SAR
/// program/compartment/sub-compartment text (open-set agency-assigned
/// codewords). So an absorbing parse contributes only the
/// classification's prior; the equivalent delim-inserted parse
/// contributes classification + the dissem token's prior, which is a
/// MORE NEGATIVE log-posterior. Without a corrective penalty the
/// absorbing parse always wins. SCI absorption usually self-resolves
/// because [`marque_core::Parser::parse`]'s SCI subgrammar produces
/// [`marque_ism::TokenKind::Unknown`] for non-alphanumeric/wrong-shape
/// compartment tokens (which step 3a then drops), but SAR's grammar
/// accepts any `[A-Z0-9]+` identifier and absorbs cleanly — leaving
/// SAR as the observed failure mode on the SC-004 corpus (the
/// `SAR-BP-J12 …` and `SPECIAL ACCESS REQUIRED-BUTTER POPCORN …`
/// fixtures pre-PR-5).
///
/// **Magnitude.** Empirically the absorbing-vs-delim-inserted spread
/// on those two fixtures is ~9 nats; the [`MISSING_TOKEN_LOG_PRIOR`]
/// floor (`-12.0`) gives a comfortable margin and is robust to small
/// future shifts in the priors table. Defining the penalty as
/// `MISSING_TOKEN_LOG_PRIOR` (rather than re-stating the literal)
/// keeps the two below-floor signals mechanically at parity for any
/// candidate that triggers both — a future ratchet of one constant
/// pulls the other along.
///
/// **Safety.** Hard-splitter tokens are all 4+ chars and have shapes
/// distinct from real SAR identifiers (`BP`, `CD`, `XR` are 2-char;
/// `BUTTER POPCORN`, `J12`, `K15`, `XRA` are alphanumeric short
/// codes that don't collide with the hard-splitter list). So this
/// penalty cannot fire on a legitimate SAR/SCI parse.
const HARD_SPLITTER_ABSORPTION_PENALTY: f32 = MISSING_TOKEN_LOG_PRIOR;
3641
/// Per-entry structural penalty for SCI markings whose control system
/// landed as [`SciControlSystem::Custom`]. Issue #133 PR 6.
///
/// **Where it is applied.** `custom_sci_marking_penalty` multiplies
/// this constant by the number of `Custom`-system entries in
/// `sci_markings`, and [`score_candidate`] adds that product to the
/// posterior only — the returned prior is never touched.
///
/// **Why this penalty exists.** `marque_core::Parser`'s structural SCI
/// subparser (CAPCO-2016 §A.6 grammar) accepts any alphanumeric
/// identifier as a "custom" control system / compartment when the
/// segment text contains `-` or `/`. That branch was added so legal
/// compound SCI shapes (`SI-G ABCD DEFG-MMM AACD`) parse correctly,
/// but it has a side effect: a typo'd or stray segment like
/// `USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB` parses cleanly into
/// three `Custom`-system SCI markings (USAR/CD/XR with attached
/// compartments). The bag-of-tokens scorer can't tell that this is
/// the wrong interpretation — `Custom` SCI control systems don't
/// appear in `canonical_tokens_for`, so they don't shift the prior
/// either way, and the candidate ties with structurally-richer
/// alternatives like the SAR-repaired candidate that
/// `try_sar_indicator_repair` emits.
///
/// **What the penalty does.** Adds [`MISSING_TOKEN_LOG_PRIOR`] (the
/// same below-observed-floor magnitude as
/// [`HARD_SPLITTER_ABSORPTION_PENALTY`]) per `Custom`-system SCI
/// marking. The penalty is per-entry so candidates that absorbed
/// multiple stray segments (like the 3-segment USAR/CD/XR case) get
/// progressively worse posteriors, restoring the SAR-repair
/// candidate's lead by a margin that clears
/// [`UNAMBIGUOUS_LOG_MARGIN`].
///
/// **Magnitude.** Same `-12.0` as the hard-splitter penalty: both are
/// "this parse pattern is highly unlikely in well-formed CAPCO
/// markings" structural signals, and keeping them at parity by
/// definition (rather than literal duplication) lets a future
/// ratchet of one move both together. A single legitimate custom
/// control (the §A.6 p16 `99` example) gets one `-12.0` hit but
/// remains the sole candidate when no alternative interpretation
/// exists, so the dispatcher still emits `Unambiguous`.
///
/// **Safety / discriminator choice.** The discriminator is
/// `sm.system == SciControlSystem::Custom(_)`, NOT
/// `sm.canonical_enum.is_none()`. The two are NOT equivalent:
/// `canonical_enum` is also `None` for legitimate `Published`-system
/// SCI markings whenever the `{system}-{first_compartment}` pair
/// doesn't map to a CVE atom (per the `canonical_enum` doc in
/// `crates/scheme/src/scheme.rs` — populated only when "the bare
/// control or `{ctrl}-{first_comp}` matches a CVE value AND no
/// sub-compartments are present"). Using `canonical_enum` as the
/// discriminator would penalize legitimate `SI-G ABCD DEFG-MMM AACD`-
/// style markings (system=`Published(Si)`, sub-compartments present
/// → canonical_enum=None), broadly skewing scoring against rich
/// SCI shapes. Discriminating on `system` directly catches the
/// USAR/CD/XR custom-only case while leaving every published SCI
/// marking — bare or compound — untouched. A candidate with mixed
/// SCI (e.g., `SI-G ABCD//99`) gets a single penalty for the `99`
/// `Custom` entry only, which is a reasonable cost for a
/// structurally suspicious mixed shape. The penalty does NOT fire
/// on candidates with empty `sci_markings` — so the SAR-repaired
/// candidate (which projects no SCI) is unaffected.
const CUSTOM_SCI_MARKING_PENALTY: f32 = MISSING_TOKEN_LOG_PRIOR;
3699
3700// (`LENIENT_REL_PREFIX_PENALTY` removed — under the current PR-9
3701// architecture, `try_rel_to_structural_repair` runs as preprocessing
3702// on the normalized text before any candidate is emitted, so
3703// `RELT O ` / `REL OT ` patterns at a token boundary are rewritten
3704// to canonical `REL TO ` before scoring sees them. The defense-in-
3705// depth scorer penalty that PR 9 originally introduced was meant to
3706// break a tie between competing raw vs. repaired *candidates* —
3707// that tie no longer exists since the repair is no longer a
3708// separate candidate. The accuracy harness
3709// (`resolution_rate_at_0_85`, `resolution_rate_does_not_regress`,
3710// per-class floors) is the load-bearing regression gate for this
3711// recovery path. Issue #186 (REL TO trigraph corpus-weighted
3712// recovery) is the followup that handles the remaining lenient-
3713// header cases via priors rather than scorer penalties.)
3714
3715/// Bag-of-tokens scorer (foundational-plan §5.2).
3716///
3717/// Returns `(prior, posterior)` where:
3718///
3719/// - `prior` = Σ [`marque_capco::priors::token_log_prior`] over the
3720/// marking's canonical tokens **plus** Σ
3721/// [`marque_capco::priors::country_code_log_prior`] over the
3722/// marking's `rel_to` country codes (issue #233). This is the prior
3723/// alone — nothing else — and is what
3724/// [`Candidate::prior_log_odds`] is documented to carry (see
3725/// `crates/scheme/src/ambiguity.rs`). Tokens or country codes
3726/// missing from the baked tables contribute
3727/// [`MISSING_TOKEN_LOG_PRIOR`] (a below-observed-floor penalty)
3728/// rather than `0.0`. The country-code contribution is what lets
3729/// the decoder break fuzzy-correction ties between common (USA,
3730/// GBR, AUS) and rare-lookalike (USB-not-a-country, UZB, ASM, AUT)
3731/// trigraphs in REL TO blocks.
3732/// - `posterior` = `prior + Σ attempt.features[i].delta + structural
3733/// penalties`. This is the quantity the decoder sorts and thresholds
3734/// on. The only structural penalty today is
3735/// [`HARD_SPLITTER_ABSORPTION_PENALTY`], applied when the strict
3736/// parse buries a reserved dissem-control token in a SAR/SCI slot.
3737///
3738/// Splitting the two prevents the caller from writing the full
3739/// posterior into `Candidate::prior_log_odds` — that would double-
3740/// count the feature deltas once any resolver re-adds
3741/// `EvidenceFeature.log_odds`. Structural penalties are deliberately
3742/// folded into the posterior only (not the prior or the per-feature
3743/// log-odds): they are a likelihood statement about parse plausibility,
3744/// not a corpus-frequency claim about token co-occurrence.
3745///
3746/// Precision: computed in `f32` — the baked priors are already `f32`
3747/// and the feature deltas are small constants (single-digit magnitude
3748/// at most), so the accumulator doesn't need `f64` headroom for the
3749/// K=8 candidate set.
3750fn score_candidate(attempt: &CanonicalAttempt, marking: &CapcoMarking) -> (f32, f32) {
3751 // Prior: sum of baked log-priors for the canonical tokens that
3752 // appear in the parsed marking. Tokens missing from the baked
3753 // table receive the floor penalty rather than a neutral 0.0
3754 // contribution — see the MISSING_TOKEN_LOG_PRIOR doc.
3755 let mut prior: f32 = 0.0;
3756 let tokens = canonical_tokens_for(marking);
3757 for token in tokens {
3758 prior += marque_capco::priors::token_log_prior(token).unwrap_or(MISSING_TOKEN_LOG_PRIOR);
3759 }
3760
3761 // Country-code prior contribution (issue #233). REL TO country
3762 // codes are not part of the `canonical_tokens_for` set because
3763 // `CountryCode::as_str()` returns a borrowed `&str` rather than
3764 // `&'static str`, and because the per-token corpus coverage for
3765 // country codes used to be sparse. Issue #233 adds a parallel
3766 // `COUNTRY_CODE_BASE_RATES` table (issue #186 sub-feature 1) so
3767 // the decoder can break fuzzy ties between popular codes (USA,
3768 // GBR, AUS, FVEY, …) and rare lookalikes (UZB, ASM,
3769 // AUT-as-Austria) by log-prior delta rather than edit distance
3770 // alone. Look up each observed REL TO code at score-time —
3771 // shape-agnostic, so the loop handles 2-char (`EU`), 3-char, and
3772 // 4-char tetragraphs uniformly. Duplicate REL TO entries do not
3773 // provide additional evidence, so score each distinct country
3774 // code at most once. Unknown entries fall to
3775 // MISSING_TOKEN_LOG_PRIOR — the same penalty the non-country-code
3776 // path uses for unrecognized tokens, which is the correct
3777 // behavior for a candidate that resolved to a non-CVE country
3778 // string.
3779 let mut seen_rel_to_codes = BTreeSet::new();
3780 for country in marking.0.rel_to.iter() {
3781 if seen_rel_to_codes.insert(country.as_str()) {
3782 prior += marque_capco::priors::country_code_log_prior(country.as_str())
3783 .unwrap_or(MISSING_TOKEN_LOG_PRIOR);
3784 }
3785 }
3786
3787 // Posterior: prior plus feature deltas plus structural penalties.
3788 let feature_sum: f32 = attempt.features.iter().map(|f| f.delta).sum();
3789 let mut posterior = prior + feature_sum;
3790 if absorbs_hard_splitter_in_sar_or_sci(marking) {
3791 posterior += HARD_SPLITTER_ABSORPTION_PENALTY;
3792 }
3793 posterior += custom_sci_marking_penalty(marking);
3794
3795 (prior, posterior)
3796}
3797
3798/// Total per-entry penalty for SCI markings whose strict parse landed
3799/// with [`SciControlSystem::Custom`] as the control system. See
3800/// [`CUSTOM_SCI_MARKING_PENALTY`] for rationale, including why this
3801/// discriminates on `sm.system` rather than on
3802/// `sm.canonical_enum.is_none()`.
3803fn custom_sci_marking_penalty(marking: &CapcoMarking) -> f32 {
3804 let attrs = &marking.0;
3805 let custom_count = attrs
3806 .sci_markings
3807 .iter()
3808 .filter(|sm| matches!(sm.system, SciControlSystem::Custom(_)))
3809 .count();
3810 custom_count as f32 * CUSTOM_SCI_MARKING_PENALTY
3811}
3812
3813/// True when the strict parse of a candidate buries a hard-splitter
3814/// dissem-control token (NOFORN, ORCON, EXDIS, FOUO, …) inside a SAR
3815/// program/compartment/sub-compartment slot or an SCI compartment/
3816/// sub-compartment slot.
3817///
3818/// Used by [`score_candidate`] to apply
3819/// [`HARD_SPLITTER_ABSORPTION_PENALTY`] — the penalty exists because
3820/// SAR's grammar accepts any alphanumeric identifier and quietly
3821/// absorbs trailing dissem-control tokens that should have been
3822/// separated from the SAR block by `//`. See the
3823/// `HARD_SPLITTER_ABSORPTION_PENALTY` doc for the full rationale.
3824///
3825/// Identifiers are checked both as whole strings AND as whitespace-
3826/// separated word sequences. The whitespace split matters for the
3827/// `Full` SAR indicator form (`SPECIAL ACCESS REQUIRED-BUTTER
3828/// POPCORN`): a multi-word program nickname like `"BUTTER POPCORN"`
3829/// may have `NOFORN` absorbed as a trailing word, producing
3830/// `identifier: "BUTTER POPCORN NOFORN"`. Without the per-word
3831/// check, the absorption pattern slips past the whole-string
3832/// `is_hard_splitter` lookup.
3833fn absorbs_hard_splitter_in_sar_or_sci(marking: &CapcoMarking) -> bool {
3834 let attrs = &marking.0;
3835
3836 if let Some(sar) = attrs.sar_markings.as_ref() {
3837 for prog in sar.programs.iter() {
3838 if contains_hard_splitter_word(&prog.identifier) {
3839 return true;
3840 }
3841 for comp in prog.compartments.iter() {
3842 if contains_hard_splitter_word(&comp.identifier) {
3843 return true;
3844 }
3845 if comp
3846 .sub_compartments
3847 .iter()
3848 .any(|sub| contains_hard_splitter_word(sub))
3849 {
3850 return true;
3851 }
3852 }
3853 }
3854 }
3855
3856 for sci in attrs.sci_markings.iter() {
3857 for comp in sci.compartments.iter() {
3858 if contains_hard_splitter_word(&comp.identifier) {
3859 return true;
3860 }
3861 if comp
3862 .sub_compartments
3863 .iter()
3864 .any(|sub| contains_hard_splitter_word(sub))
3865 {
3866 return true;
3867 }
3868 }
3869 }
3870
3871 false
3872}
3873
3874/// True when `s` is a hard-splitter token, or contains a hard-
3875/// splitter token as a whitespace-separated word. The per-word check
3876/// covers multi-word `Full` SAR program nicknames (`BUTTER POPCORN`)
3877/// that absorbed a trailing dissem-control word.
3878fn contains_hard_splitter_word(s: &str) -> bool {
3879 if is_hard_splitter(s) {
3880 return true;
3881 }
3882 s.split_whitespace().any(is_hard_splitter)
3883}
3884
3885/// Enumerate the canonical tokens present in `marking` that have a
3886/// `&'static str` representation suitable for
3887/// [`marque_capco::priors::TOKEN_BASE_RATES`] lookup.
3888///
3889/// Scored token families, by `IsmAttributes` field:
3890///
3891/// - `classification` — effective level's banner string
3892/// (`SECRET`, `TOP SECRET`, ...).
3893/// - `sci_controls` — each variant's `as_str()` (`SI`, `TK`, `HCS-P`, ...).
3894/// - `dissem_controls` — IC dissem variants' `as_str()`
3895/// (`NF`, `OC`, `RELIDO`, ...).
3896/// - `non_ic_dissem` — non-IC dissem variants' `banner_str()`
3897/// (`LIMDIS`, `EXDIS`, `NODIS`, `SBU`, `LES`, ...).
3898/// - `aea_markings` — category token `"AEA"` when any AEA marking is
3899/// present. Individual AEA sub-variants (RD / FRD / CNWDI /
3900/// SIGMA / UCNI variants) are not broken out for scoring because
3901/// the baked priors don't carry per-sub-variant base rates and
3902/// adding floor-penalty contributions for each variant would hurt
3903/// AEA-bearing candidates across the board.
3904/// - `fgi_marker` — category token `"FGI"` when an FGI marker is set.
3905///
3906/// Deliberately NOT included here:
3907///
3908/// - `sar_markings` — SAR program identifiers are agency-assigned
3909/// codewords (open set, not in the baked priors).
3910/// - `rel_to` country codes — scored separately in
3911/// [`score_candidate`] via
3912/// [`marque_capco::priors::country_code_log_prior`] (issue #233).
3913/// `CountryCode::as_str()` returns a `&str` tied to `&self`, not
3914/// `&'static str`, so the country-code contribution is summed at
3915/// score-time rather than collected here.
3916/// - CAB fields (`classified_by`, `derived_from`, `declassify_on`) —
3917/// free-form text, not CVE-enumerable.
3918///
3919/// Expansion work is tracked in future PRs alongside any priors
3920/// regeneration that widens coverage (e.g., counting SAR indicator
3921/// base rates from a larger corpus).
3922fn canonical_tokens_for(marking: &CapcoMarking) -> Vec<&'static str> {
3923 let attrs = &marking.0;
3924 let mut tokens: BTreeSet<&'static str> = BTreeSet::new();
3925
3926 if let Some(class) = attrs.classification.as_ref() {
3927 // Use the effective level's banner form as the classification
3928 // token — this is the form the priors corpus keys on for the
3929 // "common classification appears" prior.
3930 tokens.insert(class.effective_level().banner_str());
3931 }
3932
3933 for ctrl in attrs.sci_controls.iter() {
3934 tokens.insert(ctrl.as_str());
3935 }
3936 for dis in attrs.dissem_controls.iter() {
3937 tokens.insert(dis.as_str());
3938 }
3939 for nic in attrs.non_ic_dissem.iter() {
3940 // `NonIcDissem::banner_str` returns `&'static str` with the
3941 // banner form (LIMDIS, EXDIS, NODIS, SBU, LES, SSI,
3942 // SBU NOFORN, LES NOFORN). The compound forms ("SBU NOFORN",
3943 // "LES NOFORN") won't hit a single-token priors entry — they
3944 // fall to MISSING_TOKEN_LOG_PRIOR. That's fine: the
3945 // comparison against peer candidates remains consistent.
3946 tokens.insert(nic.banner_str());
3947 }
3948 if !attrs.aea_markings.is_empty() {
3949 tokens.insert("AEA");
3950 }
3951 if attrs.fgi_marker.is_some() {
3952 tokens.insert("FGI");
3953 }
3954
3955 tokens.into_iter().collect()
3956}
3957
3958// ---------------------------------------------------------------------------
3959// Strict + decoder dispatcher
3960// ---------------------------------------------------------------------------
3961
/// Recognizer that runs the strict path first and falls back to the
/// decoder when the strict parse is missing or incomplete.
///
/// Default recognizer installed by [`crate::Engine::new`]. Callers
/// that need strict-only dispatch (the SC-001 interactive-latency
/// benchmark, tests asserting strict behavior) install
/// [`StrictRecognizer`] explicitly via
/// [`crate::Engine::with_recognizer`].
///
/// Within this recognizer, dispatch is keyed off
/// [`ParseContext::strict_evidence`]:
///
/// - `strict_evidence = true`: collapse to strict-only behavior. The
///   decoder is not called. The engine never sets this; it's reserved
///   for callers (e.g., test code) that construct a `ParseContext`
///   directly and want to drive only the strict half of the dispatcher.
/// - `strict_evidence = false` (the engine default): try strict first.
///   The strict result is returned untouched when
///   `infer_marking_type` cannot classify the bytes, when it is an
///   `Unambiguous` parse that `strict_parse_is_complete` accepts for
///   the inferred kind, or when it is an `Ambiguous` with at least one
///   candidate. In the remaining cases — a zero-candidate `Ambiguous`,
///   or an `Unambiguous` parse that is empty / partial — the decoder
///   runs. The incomplete-`Unambiguous` case matters because
///   `marque_core::Parser` is lenient: it accepts arbitrary
///   `BYTES//BYTES` shapes and returns `Ok` with an empty
///   `IsmAttributes` when nothing in the input is a recognized CVE
///   token. Treating such a result as a successful parse would leave
///   the decoder dormant on exactly the mangled inputs it exists to
///   recover (`SERCET//NOFORN`, `NOFORN//SECRET`, …). The decoder's
///   answer replaces the strict result only when it is `Unambiguous`;
///   otherwise the strict result is kept so rules can still fire on
///   any partial attributes. Strict is always called with
///   `strict_evidence = true` internally; the decoder is always called
///   with `strict_evidence = false` internally.
///
/// Every other [`ParseContext`] field is cloned from the caller's
/// context and passed through unchanged.
#[derive(Debug, Default, Clone, Copy)]
pub struct StrictOrDecoderRecognizer {
    // First-pass recognizer; always consulted.
    strict: StrictRecognizer,
    // Fallback recognizer; consulted only when the strict pass is
    // empty or incomplete and the caller did not ask for strict-only.
    decoder: DecoderRecognizer,
}
4000
4001impl StrictOrDecoderRecognizer {
4002 pub const fn new() -> Self {
4003 Self {
4004 strict: StrictRecognizer::new(),
4005 decoder: DecoderRecognizer::new(),
4006 }
4007 }
4008}
4009
4010impl Recognizer<CapcoScheme> for StrictOrDecoderRecognizer {
4011 fn recognize(&self, bytes: &[u8], cx: &ParseContext) -> Parsed<CapcoMarking> {
4012 let strict_inner_cx = ParseContext {
4013 strict_evidence: true,
4014 ..cx.clone()
4015 };
4016 let strict_result = self.strict.recognize(bytes, &strict_inner_cx);
4017
4018 // When the outer caller asked for strict-only via
4019 // `strict_evidence = true`, collapse to the strict result —
4020 // never call the decoder. The engine never sets this flag (it
4021 // installs `StrictRecognizer` directly via `with_recognizer`
4022 // when a strict-only mode is needed); this branch exists for
4023 // direct callers that construct a `ParseContext` themselves
4024 // (e.g., test code).
4025 if cx.strict_evidence {
4026 return strict_result;
4027 }
4028
4029 // Infer the candidate kind from the byte shape so
4030 // `strict_parse_is_complete` can apply the right rule
4031 // (classification-requiring for portion/banner, CAB-field-
4032 // requiring for CAB). If inference fails the bytes are too
4033 // degenerate for either path — skip and return whatever the
4034 // strict path produced (most likely zero-candidate Ambiguous).
4035 let Some(kind) = infer_marking_type(bytes) else {
4036 return strict_result;
4037 };
4038
4039 // Complete strict parse — take it, decoder not needed.
4040 if matches!(&strict_result, Parsed::Unambiguous(m) if strict_parse_is_complete(m, kind)) {
4041 return strict_result;
4042 }
4043
4044 // Strict already produced non-empty candidates — keep them.
4045 if matches!(&strict_result, Parsed::Ambiguous { candidates } if !candidates.is_empty()) {
4046 return strict_result;
4047 }
4048
4049 // Remaining cases: either an incomplete-but-Unambiguous strict parse
4050 // (partial attrs, `TokenKind::Unknown` spans, missing classification,
4051 // etc.) or a zero-candidate strict Ambiguous. Both warrant a decoder
4052 // attempt. Cases:
4053 // (a) Truly empty attrs (`FROBNITZ//WIBBLE`) — zero-candidate strict.
4054 // (b) Partial attrs (`(SERCET//NOFORN)` — NOFORN parsed, SERCET
4055 // left in a Classification-kind span with
4056 // `attrs.classification = None`) — incomplete Unambiguous.
4057 let decoder_cx = ParseContext {
4058 strict_evidence: false,
4059 ..cx.clone()
4060 };
4061 let decoder_result = self.decoder.recognize(bytes, &decoder_cx);
4062
4063 // Only adopt the decoder result when it produced an Unambiguous
4064 // marking. If the decoder is also uncertain, preserve the strict
4065 // result so rules can still fire on any partial attrs — avoiding
4066 // deep-scan silently reducing observability/diagnostics on
4067 // mangled input.
4068 match decoder_result {
4069 Parsed::Unambiguous(_) => decoder_result,
4070 _ => strict_result,
4071 }
4072 }
4073}
4074
4075// ---------------------------------------------------------------------------
4076// Tests
4077// ---------------------------------------------------------------------------
4078
4079#[cfg(test)]
4080#[cfg_attr(coverage_nightly, coverage(off))]
4081mod tests {
4082 use super::*;
4083
4084 #[test]
4085 fn decoder_is_send_sync_as_trait_object() {
4086 fn assert_send_sync<T: Send + Sync + ?Sized>() {}
4087 assert_send_sync::<DecoderRecognizer>();
4088 assert_send_sync::<StrictOrDecoderRecognizer>();
4089 assert_send_sync::<std::sync::Arc<dyn Recognizer<CapcoScheme>>>();
4090 }
4091
4092 fn deep_cx() -> ParseContext {
4093 ParseContext {
4094 strict_evidence: false,
4095 zone: None,
4096 position: None,
4097 classification_floor: None,
4098 as_of: None,
4099 preceded_by_whitespace: true,
4100 }
4101 }
4102
4103 // ----- Missing-delimiter insertion (issue #133 PR 3) -----
4104
4105 #[test]
4106 fn try_insert_delimiter_inserts_before_long_form_dissem() {
4107 // Hard-splitter rule: long-form dissem after whitespace.
4108 let cases: &[(&str, &str)] = &[
4109 ("SECRET//NOFORN EXDIS", "SECRET//NOFORN//EXDIS"),
4110 ("SECRET//NOFORN ORCON", "SECRET//NOFORN//ORCON"),
4111 ("SECRET//SI ORCON", "SECRET//SI//ORCON"),
4112 ];
4113 for (input, expected) in cases {
4114 let result = try_insert_delimiter(input);
4115 assert_eq!(
4116 result.as_deref(),
4117 Some(*expected),
4118 "input {input:?} should produce {expected:?}; got {result:?}"
4119 );
4120 }
4121 }
4122
4123 #[test]
4124 fn try_insert_delimiter_classification_boundary() {
4125 // Rule 1: classification → next segment.
4126 let cases: &[(&str, &str)] = &[
4127 (
4128 "SECRET REL TO USA, AUS, GBR",
4129 "SECRET//REL TO USA, AUS, GBR",
4130 ),
4131 ("SECRET NOFORN", "SECRET//NOFORN"),
4132 ("TOP SECRET NOFORN", "TOP SECRET//NOFORN"),
4133 ];
4134 for (input, expected) in cases {
4135 let result = try_insert_delimiter(input);
4136 assert_eq!(
4137 result.as_deref(),
4138 Some(*expected),
4139 "input {input:?} should produce {expected:?}; got {result:?}"
4140 );
4141 }
4142 }
4143
4144 #[test]
4145 fn try_insert_delimiter_does_not_split_top_secret() {
4146 // TOP SECRET is the only multi-word classification — the
4147 // helper must not insert `//` between TOP and SECRET.
4148 // The first rule fires only on the first NON-classification
4149 // token; SECRET after TOP is a classification continuation.
4150 let result = try_insert_delimiter("TOP SECRET//NF");
4151 // No insertion needed at all (input is already canonical).
4152 assert_eq!(result, None);
4153 }
4154
4155 #[test]
4156 fn try_insert_delimiter_does_not_split_sbu_noforn() {
4157 // SBU NOFORN is the non-IC dissem banner long form for
4158 // SbuNf — must remain a single multi-word atom.
4159 let result = try_insert_delimiter("SECRET//SBU NOFORN");
4160 assert_eq!(result, None, "SBU NOFORN must not be split; got {result:?}");
4161 }
4162
4163 #[test]
4164 fn try_insert_delimiter_does_not_split_les_noforn() {
4165 // LES NOFORN is the non-IC dissem banner long form for
4166 // LesNf — must remain a single multi-word atom.
4167 let result = try_insert_delimiter("SECRET//LES NOFORN");
4168 assert_eq!(result, None, "LES NOFORN must not be split; got {result:?}");
4169 }
4170
4171 #[test]
4172 fn try_insert_delimiter_no_op_on_canonical() {
4173 // Already-canonical inputs produce None (no insertion).
4174 for input in &[
4175 "SECRET//NOFORN",
4176 "TOP SECRET//SI//NOFORN",
4177 "(S//NF)",
4178 "UNCLASSIFIED",
4179 ] {
4180 let result = try_insert_delimiter(input);
4181 assert_eq!(
4182 result, None,
4183 "input {input:?} is canonical; should produce None, got {result:?}"
4184 );
4185 }
4186 }
4187
4188 #[test]
4189 fn try_insert_delimiter_capped_at_max_insertions() {
4190 // Pathological input with many splitters — the cap should
4191 // limit insertions. Hard cap is `MAX_DELIMITER_INSERTIONS`
4192 // (4 today); 6 splitters in the input should produce at
4193 // most 4 insertions in the output.
4194 let input = "SECRET NOFORN ORCON PROPIN IMCON RELIDO RSEN";
4195 let result = try_insert_delimiter(input);
4196 assert!(result.is_some());
4197 let inserted = result.unwrap();
4198 let inserted_count = inserted.matches("//").count();
4199 assert!(
4200 inserted_count <= MAX_DELIMITER_INSERTIONS,
4201 "must not exceed MAX_DELIMITER_INSERTIONS={MAX_DELIMITER_INSERTIONS}; \
4202 got {inserted_count} insertions in {inserted:?}"
4203 );
4204 }
4205
4206 #[test]
4207 fn try_insert_delimiter_preserves_existing_double_slash() {
4208 // Existing `//` separators must be preserved verbatim.
4209 let result = try_insert_delimiter("SECRET//NOFORN EXDIS");
4210 let s = result.expect("should insert");
4211 // Two `//` total: one preserved in SECRET//NOFORN, plus one
4212 // inserted for NOFORN//EXDIS.
4213 let count = s.matches("//").count();
4214 assert_eq!(
4215 count, 2,
4216 "expected 2 `//` total (1 preserved + 1 inserted), got {count} in {s:?}"
4217 );
4218 }
4219
4220 #[test]
4221 fn try_insert_delimiter_preserves_non_ascii_characters_verbatim() {
4222 // Regression guard for PR #175 review: the helper used to do
4223 // `result.push(bytes[i] as char)` for non-token, non-`/`,
4224 // non-whitespace characters, which corrupts multi-byte UTF-8
4225 // sequences by emitting each byte as a separate Latin-1
4226 // codepoint (e.g., `∕` → 3 garbage codepoints). The fix
4227 // walks `text[i..].chars()` to take one full character and
4228 // advances `i` by `ch.len_utf8()`, preserving the original
4229 // UTF-8 byte sequence in the output.
4230 //
4231 // The fixture below has a stray `∕` (U+2215, 3 bytes in
4232 // UTF-8) that the upstream delimiter normalizer didn't catch.
4233 // The helper must echo the original bytes verbatim into the
4234 // output (no insertion would happen here — there's no
4235 // splitter token after the `∕`), and the round-trip must
4236 // preserve the `∕` character intact.
4237 let input = "SECRET ∕∕ NOFORN";
4238 let result = try_insert_delimiter(input);
4239 // Whether or not the helper emits a result depends on the
4240 // tokenization — what matters is that NO character in the
4241 // output corrupts the `∕` UTF-8 sequence. Test the result
4242 // (or the input passthrough if None).
4243 let was_some = result.is_some();
4244 let s = result.unwrap_or_else(|| input.to_string());
4245 assert!(
4246 s.is_char_boundary(s.len()),
4247 "output {s:?} must end on a char boundary"
4248 );
4249 // The `∕` character (U+2215) must survive intact in the
4250 // output. If the old `bytes[i] as char` shape was still in
4251 // play, the 3-byte UTF-8 sequence [0xE2, 0x88, 0x95] would
4252 // be emitted as three separate codepoints (U+00E2 U+0088
4253 // U+0095), and the original `∕` would not appear.
4254 assert!(
4255 !was_some || s.contains('∕'),
4256 "output {s:?} must preserve the U+2215 character when the \
4257 helper emitted any output"
4258 );
4259 }
4260
4261 #[test]
4262 fn is_hard_splitter_covers_documented_long_forms() {
4263 // Pin the hard-splitter set against accidental shrinkage —
4264 // every long-form dissem from the doc table must remain
4265 // a hard splitter.
4266 for token in &[
4267 "NOFORN",
4268 "ORCON",
4269 "ORCON-USGOV",
4270 "PROPIN",
4271 "IMCON",
4272 "RELIDO",
4273 "RSEN",
4274 "EYESONLY",
4275 "FOUO",
4276 "FISA",
4277 "DSEN",
4278 "EXDIS",
4279 "NODIS",
4280 "LIMDIS",
4281 ] {
4282 assert!(
4283 is_hard_splitter(token),
4284 "{token:?} must be a hard splitter (issue #133 PR 3)"
4285 );
4286 }
4287 }
4288
4289 #[test]
4290 fn is_hard_splitter_excludes_short_forms() {
4291 // Short-form abbreviations (NF, OC, PR, IMC, RS) are
4292 // intentionally excluded — they could collide with SAR
4293 // compartment / sub-compartment naming.
4294 for token in &["NF", "OC", "PR", "IMC", "RS"] {
4295 assert!(
4296 !is_hard_splitter(token),
4297 "{token:?} is intentionally NOT a hard splitter (collision risk)"
4298 );
4299 }
4300 }
4301
4302 // ----- Position-aware classification heuristic (issue #133 PR 2) -----
4303
4304 #[test]
4305 fn heuristic_2char_ts_cluster() {
4306 // T-cluster + S-cluster → TS. Cover the full 6×5 = 30
4307 // combinations that should fire, plus a couple that shouldn't.
4308 for first in &['T', 'R', 'Y', 'H', 'G', 'F'] {
4309 for second in &['A', 'W', 'E', 'Z', 'S'] {
4310 let token: String = [*first, *second].iter().collect();
4311 assert_eq!(
4312 try_2char_classification_heuristic(&token),
4313 Some("TS"),
4314 "{token:?} should heuristic-fix to TS"
4315 );
4316 }
4317 }
4318 // Lowercase variants normalize via the helper's
4319 // to_ascii_uppercase.
4320 assert_eq!(try_2char_classification_heuristic("ys"), Some("TS"));
4321 assert_eq!(try_2char_classification_heuristic("Ys"), Some("TS"));
4322 }
4323
4324 #[test]
4325 fn heuristic_2char_no_match_outside_clusters() {
4326 // First char outside T-cluster → no match.
4327 for token in &["AS", "WS", "ZS", "BS", "DS", "QS"] {
4328 assert_eq!(
4329 try_2char_classification_heuristic(token),
4330 None,
4331 "{token:?} should not heuristic-fix"
4332 );
4333 }
4334 // Second char outside S-cluster → no match.
4335 for token in &["TR", "RY", "HG", "GH", "FB"] {
4336 assert_eq!(
4337 try_2char_classification_heuristic(token),
4338 None,
4339 "{token:?} should not heuristic-fix"
4340 );
4341 }
4342 }
4343
4344 #[test]
4345 fn heuristic_1char_s_cluster() {
4346 // S-key neighbors → S. Bare S is canonical and excluded by
4347 // the upstream `is_canonical_short_classification` guard, so
4348 // the helper returns Some("S") for S-key neighbors and the
4349 // outer logic suppresses the no-op case.
4350 for token in &["A", "W", "E", "Z"] {
4351 assert_eq!(
4352 try_1char_classification_heuristic(token),
4353 Some("S"),
4354 "{token:?} should heuristic-fix to S"
4355 );
4356 }
4357 // X is between C and S; defaults to S per the design note.
4358 assert_eq!(try_1char_classification_heuristic("X"), Some("S"));
4359 }
4360
4361 #[test]
4362 fn heuristic_1char_c_cluster() {
4363 // C-key neighbors → C.
4364 for token in &["V", "F"] {
4365 assert_eq!(
4366 try_1char_classification_heuristic(token),
4367 Some("C"),
4368 "{token:?} should heuristic-fix to C"
4369 );
4370 }
4371 }
4372
4373 #[test]
4374 fn heuristic_1char_no_match_outside_clusters() {
4375 // Letters not in any heuristic cluster.
4376 for token in &["B", "D", "G", "K", "M", "N", "Q", "T", "Y"] {
4377 assert_eq!(
4378 try_1char_classification_heuristic(token),
4379 None,
4380 "{token:?} should not heuristic-fix"
4381 );
4382 }
4383 }
4384
4385 #[test]
4386 fn heuristic_skips_canonical_classifications() {
4387 // Bare canonical short forms must not produce a heuristic
4388 // fix — the strict parser already accepts them.
4389 for canonical in &["U", "R", "C", "S", "TS"] {
4390 assert!(
4391 is_canonical_short_classification(canonical),
4392 "{canonical:?} should be recognized as canonical"
4393 );
4394 }
4395 // And the wrapper helper short-circuits these.
4396 assert_eq!(try_classification_heuristic_fix("(S//NF)"), None);
4397 assert_eq!(try_classification_heuristic_fix("(TS//NF)"), None);
4398 assert_eq!(try_classification_heuristic_fix("(C//NF)"), None);
4399 assert_eq!(try_classification_heuristic_fix("SECRET//NOFORN"), None);
4400 }
4401
#[test]
fn heuristic_fixes_portion_form() {
    // (mangled, repaired) pairs for parenthesized portion markings.
    // The last row has a lowercase first token inside the parens.
    let table: [(&str, &str); 4] = [
        ("(YS//NF)", "(TS//NF)"),
        ("(W//NF)", "(S//NF)"),
        ("(F//NF)", "(C//NF)"),
        ("(ys//NF)", "(TS//NF)"),
    ];
    for &(mangled, repaired) in table.iter() {
        assert_eq!(
            try_classification_heuristic_fix(mangled).as_deref(),
            Some(repaired)
        );
    }
}
4422
#[test]
fn heuristic_fixes_banner_form() {
    // Banner shapes carry no parentheses; apart from that they work
    // like portions — the classification token leads the first segment.
    let table: [(&str, &str); 2] = [
        ("RS//NOFORN", "TS//NOFORN"),
        ("X//NOFORN", "S//NOFORN"),
    ];
    for &(mangled, repaired) in table.iter() {
        assert_eq!(
            try_classification_heuristic_fix(mangled).as_deref(),
            Some(repaired)
        );
    }
}
4436
#[test]
fn heuristic_skips_cab_shape() {
    // A CAB line has no leading classification token. Every
    // CAB-keyword prefix should be caught by the `is_cab_head`
    // short-circuit at the top of the helper.
    let cab_lines = [
        "Classified By: foo",
        "Derived From: bar",
        "Declassify On: baz",
    ];
    for cab_line in cab_lines {
        assert_eq!(try_classification_heuristic_fix(cab_line), None);
    }
}
4446
#[test]
fn heuristic_skips_long_token() {
    // Tokens of 4+ chars fall through the length match arm and are
    // left to the vocab fuzzy path. Most 3-char tokens go that way
    // too: since PR 8 added bare `TOP` to
    // `EXTENDED_CORRECTION_VOCAB`, shapes like `TPP` and `UOP`
    // correct via dist-1 fuzzy. The 3-char heuristic itself is
    // deliberately narrow (only `OTP` → `TOP`), so an unrelated
    // 3-char token such as `YES` yields None.
    for untouched in ["(YES//NF)", "(SECT//NF)", "SECRET//NOFORN"] {
        assert_eq!(try_classification_heuristic_fix(untouched), None);
    }
}
4460
4461 // ----- 3-char classification heuristic (issue #133 PR 8) -----
4462
#[test]
fn heuristic_recovers_otp_to_top_via_3char_rule() {
    // `OTP` is `TOP` with T and O transposed. As a standard
    // Levenshtein distance of 2 it is blocked by the vocab fuzzy
    // path's `MIN_USEFUL_CONFIDENCE` floor, so the targeted 3-char
    // heuristic is what recovers it.
    let check = |input: &str, expected: &str| {
        let result = try_classification_heuristic_fix(input);
        assert_eq!(
            result.as_deref(),
            Some(expected),
            "input {input:?} should heuristic-fix to {expected:?}; got {result:?}"
        );
    };
    check("OTP SECRET//NOFORN", "TOP SECRET//NOFORN");
    check("(OTP//NF)", "(TOP//NF)");
    check("OTP SECRET//SI//NOFORN", "TOP SECRET//SI//NOFORN");
}
4482
#[test]
fn try_3char_classification_heuristic_only_matches_otp() {
    // The 3-char heuristic is a single hardcoded `OTP → TOP` mapping
    // on purpose. Every other 3-char input returns None and is left
    // to the other recovery paths. This is pinned because the 3-char
    // trigraph vocab is dense (TON, TUR, TWN, …), so a wider rule
    // would generate too many false positives.
    assert_eq!(try_3char_classification_heuristic("OTP"), Some("TOP"));

    let must_not_fire = ["TON", "TPP", "UOP", "TIP", "TPO", "TOO", "ABC", "YES"];
    must_not_fire.iter().for_each(|not_a_match| {
        assert_eq!(
            try_3char_classification_heuristic(not_a_match),
            None,
            "3-char heuristic must not fire on {not_a_match:?}",
        );
    });
}
4500
4501 // ----- Extended 2-char heuristic for TP/TO → TOP -----
4502
#[test]
fn heuristic_recovers_tp_and_to_to_top_via_2char_rule() {
    // PR 8 extended the 2-char heuristic so `TP` and `TO` map to
    // `TOP`. Both are corpus-attested classification typos: `TP`
    // drops the middle `O`, `TO` drops the trailing `P`. Neither `P`
    // nor `O` is in the S-cluster, so they must not collide with the
    // TS rule.
    let table = [
        ("TP SECRET//NOFORN", "TOP SECRET//NOFORN"),
        ("TO SECRET//NOFORN", "TOP SECRET//NOFORN"),
        ("(TP//NF)", "(TOP//NF)"),
        ("(TO//NF)", "(TOP//NF)"),
    ];
    for &(input, expected) in table.iter() {
        let result = try_classification_heuristic_fix(input);
        assert_eq!(
            result.as_deref(),
            Some(expected),
            "input {input:?} should heuristic-fix to {expected:?}; got {result:?}"
        );
    }
}
4525
#[test]
fn try_2char_classification_heuristic_ts_rule_takes_precedence() {
    // The TS rule (a T-cluster char followed by an S-cluster char)
    // is checked first, with TP/TO → TOP as the fallback. `P` and `O`
    // are not in the S-cluster, so the two rules cannot actually
    // conflict today. The precedence is pinned anyway so that a
    // future widening of the TP/TO rule cannot silently override
    // the TS rule.
    let expectations: [(&str, Option<&str>); 7] = [
        // Pure T-cluster + S-cluster → TS.
        ("TS", Some("TS")),
        ("RS", Some("TS")),
        ("YS", Some("TS")),
        // T + non-S-cluster → TOP (only for P/O).
        ("TP", Some("TOP")),
        ("TO", Some("TOP")),
        // T + other non-S-cluster → still None (don't broaden).
        ("TI", None),
        ("TX", None),
    ];
    for &(token, wanted) in expectations.iter() {
        assert_eq!(try_2char_classification_heuristic(token), wanted);
    }
}
4545
#[test]
fn is_canonical_short_classification_recognizes_top() {
    // Bare `TOP` joined the canonical-short set in PR 8 so that the
    // classification heuristic stays quiet on already-canonical
    // `TOP SECRET//...` input, whose first whitespace-token is `TOP`.
    // Before PR 8 this made no difference because the length-3
    // heuristic always returned None; the OTP rule added in PR 8 is
    // what made it load-bearing.
    assert!(is_canonical_short_classification("TOP"));

    // The pre-existing canonical short forms are still recognized.
    ["U", "R", "C", "S", "TS"].iter().for_each(|s| {
        assert!(
            is_canonical_short_classification(s),
            "{s:?} must be recognized as canonical short classification",
        );
    });

    // Non-canonical forms still return false; `top` shows the check
    // is case-sensitive.
    for rejected in ["TPP", "top", "TOPS"] {
        assert!(!is_canonical_short_classification(rejected));
    }
}
4567
#[test]
fn heuristic_skips_unknown_first_char() {
    // No heuristic cluster contains the first character, so no fix
    // is offered.
    for input in ["(B//NF)", "(QS//NF)"] {
        assert_eq!(try_classification_heuristic_fix(input), None);
    }
}
4574
#[test]
fn heuristic_skips_lone_inputs() {
    // Lone-input safety guard (issue #133 PR 4 / #176). When the
    // input shows no marking-shape signal beyond its leading token,
    // the heuristic must stay silent. Auto-applying lone-case fixes
    // would produce false positives on parenthetical refs such as
    // `(A)`, `(W)`, `(F)`, which are common in business prose. The
    // PR 4 corpus measurement counted 214,539 unrestricted body-text
    // occurrences of `A` alone in the Enron corpus against 168 in
    // marking-context, so the lone-case FP rate is ~3 orders of
    // magnitude above the in-context rate.
    //
    // Form-field input, where the caller asserts the input IS a
    // marking attempt, should still fire. That is tracked in #176:
    // once the input-source signal lands, this guard becomes
    // conditional.
    const LONE_INPUTS: [&str; 7] = [
        "(YS)",  // 2-char trigger, parens, nothing else
        "(W)",   // 1-char trigger
        "(F)",   // 1-char trigger
        "(X)",   // 1-char trigger
        "YS",    // banner-shape lone
        "W",     // bare lone token
        "(YS )", // trailing whitespace only
    ];
    LONE_INPUTS.iter().for_each(|lone| {
        assert_eq!(
            try_classification_heuristic_fix(lone),
            None,
            "lone input {lone:?} must not fire heuristic (#133 PR 4 / #176 lone-input guard)"
        );
    });
}
4606
#[test]
fn heuristic_fires_when_marking_signal_present() {
    // Mirror image of `heuristic_skips_lone_inputs`. The guard only
    // covers LONE inputs. Once there is ANY marking content after
    // the leading token — a `//` separator, or a second
    // whitespace-separated token in the first segment — the
    // heuristic fires as usual.
    let check = |input: &str, expected: &str| {
        let result = try_classification_heuristic_fix(input);
        assert_eq!(
            result.as_deref(),
            Some(expected),
            "input {input:?} should heuristic-fix to {expected:?} \
             (marking signal present); got {result:?}"
        );
    };
    check("(YS//NF)", "(TS//NF)"); // `//` separator after token
    check("(YS NF)", "(TS NF)"); // whitespace + another token
    check("YS//NOFORN", "TS//NOFORN");
    check("W//NF", "S//NF");
}
4630
#[test]
fn decoder_defers_to_strict_when_strict_evidence_is_set() {
    // With a default context (strict_evidence = true) the decoder
    // must hand back the zero-candidate Ambiguous result.
    let rx = DecoderRecognizer::new();
    let cx = ParseContext::default();
    let outcome = rx.recognize(b"(S//NF)", &cx);
    if let Parsed::Ambiguous { candidates } = &outcome {
        assert!(candidates.is_empty());
    } else {
        panic!(
            "expected zero-candidate Ambiguous, got {other:?}",
            other = outcome
        );
    }
}
4640
#[test]
fn decoder_zero_candidate_on_no_template_fit() {
    let rx = DecoderRecognizer::new();
    // Both tokens are outside the vocabulary and have no fuzzy match.
    let outcome = rx.recognize(b"FROBNITZ//WIBBLE", &deep_cx());
    match &outcome {
        Parsed::Unambiguous(m) => panic!("unexpected strict match: {m:?}"),
        Parsed::Ambiguous { candidates } => assert!(
            candidates.is_empty(),
            "unrecognized input must be zero-candidate, got {} candidate(s)",
            candidates.len()
        ),
    }
}
4654
#[test]
fn score_candidate_splits_prior_and_posterior() {
    // Build an attempt whose feature deltas are known and non-zero,
    // then check the returned (prior, posterior) pair: the posterior
    // has to be prior + Σ feature.delta, while the prior itself must
    // carry none of the feature deltas.
    let features = vec![
        FeatureEntry {
            id: FeatureId::EditDistance1,
            delta: -0.5,
        },
        FeatureId::TokenReorder.into(),
    ];
    // Summed up front so the vector can be moved into the attempt.
    let feature_sum: f32 = features.iter().map(|entry| entry.delta).sum();

    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);
    let banner = MarkingCandidate {
        span: Span::new(0, 14),
        kind: MarkingType::Banner,
    };
    let strict_parse = strict
        .parse(&banner, b"SECRET//NOFORN")
        .expect("SECRET//NOFORN must parse");
    let marking = CapcoMarking::new(strict_parse.attrs);

    let attempt = CanonicalAttempt {
        bytes: b"SECRET//NOFORN".to_vec(),
        features,
        fix_source: marque_rules::FixSource::DecoderPosterior,
    };
    let (prior, posterior) = score_candidate(&attempt, &marking);

    let rebuilt_posterior = prior + feature_sum;
    assert!(
        (rebuilt_posterior - posterior).abs() < 1e-6,
        "posterior must equal prior + Σ feature deltas; \
         prior={prior}, feature_sum={feature_sum}, posterior={posterior}"
    );
    // With non-trivial feature deltas, the prior on its own cannot
    // coincide with the posterior.
    assert!(
        (prior - posterior).abs() > f32::EPSILON,
        "prior_log_odds must exclude feature deltas; \
         prior={prior}, posterior={posterior}"
    );
}
4701
4702 // Convenience conversion for the test above.
4703 impl From<FeatureId> for FeatureEntry {
4704 fn from(id: FeatureId) -> Self {
4705 Self { id, delta: -0.4 }
4706 }
4707 }
4708
#[test]
fn score_candidate_includes_country_code_prior_for_rel_to() {
    // Issue #233: `score_candidate` adds up `country_code_log_prior`
    // across the `rel_to` slice of the parsed marking. Every country
    // code contributes a negative log-prior term, and GBR is a known
    // high-frequency trigraph, so a marking with TWO REL TO entries
    // must score a strictly lower (more negative) prior than the
    // same marking with ONE entry.
    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);

    // Single-country marking.
    let one_span = MarkingCandidate {
        span: Span::new(0, 18),
        kind: MarkingType::Banner,
    };
    let one_attrs = strict
        .parse(&one_span, b"SECRET//REL TO USA")
        .expect("SECRET//REL TO USA must parse")
        .attrs;
    let one_marking = CapcoMarking::new(one_attrs);
    let attempt_one = CanonicalAttempt {
        bytes: b"SECRET//REL TO USA".to_vec(),
        features: Vec::<FeatureEntry>::new(),
        fix_source: marque_rules::FixSource::DecoderPosterior,
    };

    // Two-country marking.
    let two_span = MarkingCandidate {
        span: Span::new(0, 23),
        kind: MarkingType::Banner,
    };
    let two_attrs = strict
        .parse(&two_span, b"SECRET//REL TO USA, GBR")
        .expect("SECRET//REL TO USA, GBR must parse")
        .attrs;
    let two_marking = CapcoMarking::new(two_attrs);
    let attempt_two = CanonicalAttempt {
        bytes: b"SECRET//REL TO USA, GBR".to_vec(),
        features: Vec::<FeatureEntry>::new(),
        fix_source: marque_rules::FixSource::DecoderPosterior,
    };

    let (prior_one, _) = score_candidate(&attempt_one, &one_marking);
    let (prior_two, _) = score_candidate(&attempt_two, &two_marking);

    // GBR's log-prior is known to be negative, so appending it to
    // the REL TO list has to push the total strictly further down.
    assert!(
        prior_two < prior_one,
        "adding GBR to REL TO must lower (more negative) the prior via \
         country_code_log_prior; prior_one={prior_one}, prior_two={prior_two}"
    );
}
4760
#[test]
fn score_candidate_deduplicates_rel_to_entries() {
    // Issue #233 dedup guard: `seen_rel_to_codes` prevents
    // double-counting, so a repeated REL TO entry ("USA, USA") has to
    // score exactly like the deduplicated form ("USA").
    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);

    // Baseline: a single "USA" entry.
    let once_span = MarkingCandidate {
        span: Span::new(0, 18),
        kind: MarkingType::Banner,
    };
    let once_attrs = strict
        .parse(&once_span, b"SECRET//REL TO USA")
        .expect("SECRET//REL TO USA must parse")
        .attrs;
    let once_marking = CapcoMarking::new(once_attrs);
    let attempt_once = CanonicalAttempt {
        bytes: b"SECRET//REL TO USA".to_vec(),
        features: Vec::<FeatureEntry>::new(),
        fix_source: marque_rules::FixSource::DecoderPosterior,
    };

    // Whether the parser yields one or two rel_to entries for
    // "USA, USA", the guard has to hold: the prior must match that
    // of a single "USA" entry.
    let dup_span = MarkingCandidate {
        span: Span::new(0, 23),
        kind: MarkingType::Banner,
    };
    let dup_attrs = strict
        .parse(&dup_span, b"SECRET//REL TO USA, USA")
        .expect("SECRET//REL TO USA, USA must parse leniently")
        .attrs;
    let dup_marking = CapcoMarking::new(dup_attrs);
    let attempt_dup = CanonicalAttempt {
        bytes: b"SECRET//REL TO USA, USA".to_vec(),
        features: Vec::<FeatureEntry>::new(),
        fix_source: marque_rules::FixSource::DecoderPosterior,
    };

    let (prior_dup, _) = score_candidate(&attempt_dup, &dup_marking);
    let (prior_once, _) = score_candidate(&attempt_once, &once_marking);

    // The repeated USA is scored once only, so both priors are the
    // same base tokens plus the same single USA prior.
    let gap = (prior_dup - prior_once).abs();
    assert!(
        gap < 1e-5,
        "duplicate REL TO entry must not double-count the country-code prior; \
         prior_dup={prior_dup}, prior_once={prior_once}"
    );
}
4813
#[test]
fn feature_entry_to_evidence_uses_canonical_label_registry() {
    // Regression guard for PR #142 H2. Projecting a `FeatureEntry`
    // onto `EvidenceFeature::label` MUST go through
    // `FeatureId::as_str()`, the single source of truth declared in
    // `crates/rules/src/confidence.rs:208`. The pre-fix shape kept a
    // divergent local registry (snake_case labels in a duplicate
    // match arm). That causes wire-format drift the audit emitter
    // cannot detect: today's dispatcher discards `Parsed::Ambiguous`
    // results, so the bug stays latent.
    //
    // Every `FeatureId` variant is listed below. A new variant that
    // lacks an `as_str()` arm fails to compile over there (that
    // match is exhaustive). Because this test reads `id.as_str()`
    // directly, a label diverging from it here would have to be
    // deliberately wrong. What carries the weight is the assertion
    // that `feature_entry_to_evidence` does the same thing.
    let every_id = [
        FeatureId::EditDistance1,
        FeatureId::EditDistance2,
        FeatureId::TokenReorder,
        FeatureId::SupersededToken,
        FeatureId::BaseRateCommonMarking,
        FeatureId::StrictContextClassification,
        FeatureId::CorpusOverrideInEffect,
    ];
    for id in every_id {
        let evidence = feature_entry_to_evidence(&FeatureEntry { id, delta: -0.5 });
        assert_eq!(
            evidence.label,
            id.as_str(),
            "decoder evidence label diverged from FeatureId::as_str() \
             for {id:?}: got {label:?}, expected {expected:?}",
            label = evidence.label,
            expected = id.as_str(),
        );
        assert_eq!(evidence.log_odds, -0.5);
    }
}
4854
#[test]
fn runner_up_ratio_saturates_on_extreme_log_margin() {
    // Regression guard for PR #127 review comment on decoder.rs:305:
    // when `log_margin` is large enough that `f32::exp()` overflows
    // (≈ ≥ 88.7 nats on f32), the previous code emitted `+∞` into
    // `Confidence::runner_up_ratio` and `Confidence::validate`
    // rejected the resulting record at the audit boundary,
    // panicking inside `FixProposal::new`. The fix saturates at
    // `f32::MAX`. We exercise both branches here with bare
    // `f32::exp` since the saturation logic is the same closed
    // expression used in `recognize`.
    //
    // `exp()` overflows once its argument exceeds ln(f32::MAX). The
    // threshold is used below to pin WHICH branch each input takes,
    // so editing the input list cannot silently drop coverage of
    // either the pass-through or the saturating branch.
    let overflow_threshold = f32::MAX.ln();
    let mut saw_finite = false;
    let mut saw_overflow = false;
    for &log_margin in &[88.0_f32, 100.0_f32, 200.0_f32, 1000.0_f32] {
        let ratio = log_margin.exp();
        let clamped = if ratio.is_finite() { ratio } else { f32::MAX };
        if log_margin > overflow_threshold {
            saw_overflow = true;
            assert!(
                ratio.is_infinite(),
                "log_margin = {log_margin}: exp() was expected to overflow, got {ratio}"
            );
            assert_eq!(
                clamped,
                f32::MAX,
                "log_margin = {log_margin}: overflow must saturate at f32::MAX"
            );
        } else {
            saw_finite = true;
            assert!(
                ratio.is_finite(),
                "log_margin = {log_margin}: exp() was expected to stay finite, got {ratio}"
            );
            assert_eq!(
                clamped, ratio,
                "log_margin = {log_margin}: a finite ratio must pass through unclamped"
            );
        }
        assert!(
            clamped.is_finite(),
            "log_margin = {log_margin}: clamped ratio must be finite, got {clamped}"
        );
        assert!(
            clamped > 0.0,
            "log_margin = {log_margin}: clamped ratio must be > 0, got {clamped}"
        );
    }
    assert!(
        saw_finite && saw_overflow,
        "input list must cover both the finite and the overflowing branch"
    );
    // And a sanity check on the in-band path: at the
    // UNAMBIGUOUS_LOG_MARGIN threshold, `exp()` returns a finite
    // value and we don't clamp.
    let at_threshold = UNAMBIGUOUS_LOG_MARGIN.exp();
    assert!(at_threshold.is_finite() && at_threshold > 1.0);
}
4884
#[test]
fn strict_parse_is_complete_rejects_unknown_classification() {
    // Regression guard for the PR #114 review comment on
    // decoder.rs:946. A strict parse of `(SERCET//NOFORN)` picks up
    // NOFORN, but SERCET resolves to no `Classification` variant, so
    // `classification` stays `None`. If the
    // `strict_parse_is_complete` check were missing, the dispatcher
    // would take this as a complete strict result and never fall
    // through to the decoder.
    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);
    let portion = MarkingCandidate {
        span: Span::new(0, 16),
        kind: MarkingType::Portion,
    };
    let attrs = strict
        .parse(&portion, b"(SERCET//NOFORN)")
        .expect("strict parser should accept (SERCET//NOFORN) leniently")
        .attrs;
    let marking = CapcoMarking::new(attrs);

    let nontrivial = is_nontrivial_marking(&marking);
    assert!(
        nontrivial,
        "NOFORN survives as a dissem control → marking is nontrivial"
    );

    let complete = strict_parse_is_complete(&marking, MarkingType::Portion);
    assert!(
        !complete,
        "SERCET left `classification: None` → strict parse is incomplete; \
         dispatcher must fall back to decoder. attrs = {:?}",
        marking.0,
    );
}
4915
#[test]
fn strict_parse_is_complete_accepts_clean_marking() {
    // Positive control: a canonical portion marking must be reported
    // as a complete strict parse.
    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);
    let portion = MarkingCandidate {
        span: Span::new(0, 7),
        kind: MarkingType::Portion,
    };
    let attrs = strict
        .parse(&portion, b"(S//NF)")
        .expect("canonical portion must strict-parse")
        .attrs;
    let marking = CapcoMarking::new(attrs);

    let complete = strict_parse_is_complete(&marking, MarkingType::Portion);
    assert!(
        complete,
        "canonical (S//NF) must be accepted as complete; attrs = {:?}",
        marking.0,
    );
}
4934
#[test]
fn strict_parse_is_complete_rejects_trailing_unknown_token() {
    // In `(S//FRBN)` the classification parses (`S` → Secret), yet
    // the tail token `FRBN` ends up in an `Unknown` span. The
    // dispatcher has to fall back so the decoder gets a chance to
    // resolve `FRBN` → `NF` (or reject it).
    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);
    let portion = MarkingCandidate {
        span: Span::new(0, 9),
        kind: MarkingType::Portion,
    };
    let attrs = strict
        .parse(&portion, b"(S//FRBN)")
        .expect("strict parser accepts (S//FRBN) leniently")
        .attrs;
    let marking = CapcoMarking::new(attrs);

    // Classification is Some because `S` resolved, but the
    // Unknown-tail check must fire regardless.
    let complete = strict_parse_is_complete(&marking, MarkingType::Portion);
    assert!(
        !complete,
        "`FRBN` is Unknown-kind → strict parse is incomplete; attrs = {:?}",
        marking.0,
    );
}
4959
#[test]
fn contains_hard_splitter_word_detects_per_word() {
    // Inputs that must match. The first three are whole-string hits.
    // The remaining three carry the splitter as one word of a
    // multi-word string (the `Full` SAR-program-nickname absorption
    // shape, e.g. `BUTTER POPCORN NOFORN`).
    let hits = [
        "NOFORN",
        "ORCON",
        "EXDIS",
        "BUTTER POPCORN NOFORN",
        "ORCON BUTTER POPCORN",
        "BUTTER NOFORN POPCORN",
    ];
    for hit in hits {
        assert!(contains_hard_splitter_word(hit));
    }

    // Negatives: clean SAR identifiers, and the empty string, must
    // not match.
    let misses = ["BP", "J12", "XRA", "BUTTER POPCORN", ""];
    for miss in misses {
        assert!(!contains_hard_splitter_word(miss));
    }
}
4978
#[test]
fn absorbs_hard_splitter_detects_full_sar_program_with_trailing_noforn() {
    use marque_ism::{IsmAttributes, SarIndicator, SarMarking, SarProgram};

    // Shape under test: `SPECIAL ACCESS REQUIRED-BUTTER POPCORN
    // NOFORN`. The strict parser builds a `Full`-indicator SAR whose
    // program identifier is the multi-word nickname
    // `"BUTTER POPCORN NOFORN"`, with NOFORN absorbed as its last
    // word. Pinned so the per-word check in
    // `contains_hard_splitter_word` keeps firing.
    let absorbing_program =
        SarProgram::new(Box::from("BUTTER POPCORN NOFORN"), Box::new([]));
    let sar_block = SarMarking::new(SarIndicator::Full, Box::new([absorbing_program]));

    let mut ism = IsmAttributes::default();
    ism.sar_markings = Some(sar_block);
    let marking = CapcoMarking::new(ism);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        absorbed,
        "NOFORN as trailing word of multi-word SAR program identifier must be detected"
    );
}
5002
#[test]
fn absorbs_hard_splitter_in_sar_detects_noforn_as_subcomp() {
    use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};

    // Built by hand: a SAR program in which NOFORN sits as a
    // sub-compartment of an ordinary compartment. This mirrors the
    // parse shape of `SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/
    // XR-XRA RB NOFORN` when the strict parser absorbs NOFORN at the
    // SAR-block tail.
    let compartment = SarCompartment::new(
        Box::from("J12"),
        Box::new([Box::from("RB"), Box::from("NOFORN")]),
    );
    let program = SarProgram::new(Box::from("BP"), Box::new([compartment]));
    let sar_block = SarMarking::new(SarIndicator::Abbrev, Box::new([program]));

    let mut ism = IsmAttributes::default();
    ism.sar_markings = Some(sar_block);
    let marking = CapcoMarking::new(ism);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        absorbed,
        "NOFORN as SAR sub-compartment must be detected as absorption"
    );
}
5029
#[test]
fn absorbs_hard_splitter_in_sar_detects_noforn_as_compartment_identifier() {
    use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};

    // PR #178 review (Codecov, decoder.rs:1795): pins the
    // SAR-compartment-IDENTIFIER branch, as opposed to the
    // sub-compartment branch. In some absorbing parses the hard
    // splitter becomes the compartment identifier itself instead of
    // a sub-compartment leaf. One example is a `SAR-BP NOFORN` shape
    // where the strict parser emits `BP` as the program and `NOFORN`
    // as a bare compartment with no sub-compartments.
    let bare_compartment = SarCompartment::new(Box::from("NOFORN"), Box::new([]));
    let program = SarProgram::new(Box::from("BP"), Box::new([bare_compartment]));
    let sar_block = SarMarking::new(SarIndicator::Abbrev, Box::new([program]));

    let mut ism = IsmAttributes::default();
    ism.sar_markings = Some(sar_block);
    let marking = CapcoMarking::new(ism);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        absorbed,
        "NOFORN as SAR compartment identifier must be detected as absorption"
    );
}
5055
#[test]
fn absorbs_hard_splitter_accepts_clean_sar() {
    use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};

    // Negative case: the SAR uses realistic identifiers (`BP`,
    // `J12`, `RB`) and contains no hard-splitter token at any level,
    // so the penalty must NOT trigger.
    let compartment = SarCompartment::new(
        Box::from("J12"),
        Box::new([Box::from("RB"), Box::from("XRA")]),
    );
    let program = SarProgram::new(Box::from("BP"), Box::new([compartment]));
    let sar_block = SarMarking::new(SarIndicator::Abbrev, Box::new([program]));

    let mut ism = IsmAttributes::default();
    ism.sar_markings = Some(sar_block);
    let marking = CapcoMarking::new(ism);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        !absorbed,
        "clean SAR identifiers must not trigger the absorption penalty"
    );
}
5080
#[test]
fn absorbs_hard_splitter_in_sci_detects_orcon_as_subcomp() {
    use marque_ism::{
        IsmAttributes, SciCompartment, SciControlBare, SciControlSystem, SciMarking,
    };

    // Defensive coverage for SCI absorption. The current
    // strict-parser path drops most SCI absorption through the
    // `TokenKind::Unknown` filter in step 3a. A later grammar change
    // that loosens the SCI compartment shape could still let a hard
    // splitter through, so this is pinned to keep the penalty
    // defensive.
    let compartment = SciCompartment::new(Box::from("G"), Box::new([Box::from("ORCON")]));
    let sci_block = SciMarking::new(
        SciControlSystem::Published(SciControlBare::Si),
        Box::new([compartment]),
        None,
    );

    let mut ism = IsmAttributes::default();
    ism.sci_markings = Box::new([sci_block]);
    let marking = CapcoMarking::new(ism);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        absorbed,
        "ORCON as SCI sub-compartment must be detected as absorption"
    );
}
5107
#[test]
fn absorbs_hard_splitter_in_sci_detects_orcon_as_compartment_identifier() {
    use marque_ism::{
        IsmAttributes, SciCompartment, SciControlBare, SciControlSystem, SciMarking,
    };

    // PR #178 review (Codecov, decoder.rs:1811): pins the
    // SCI-compartment-IDENTIFIER branch, as opposed to the
    // sub-compartment branch. This is defensive coverage. Today the
    // strict-parser SCI path drops most absorption through the
    // `TokenKind::Unknown` filter at step 3a, but if a future
    // grammar change lets a hard splitter through as the compartment
    // id, the penalty needs to be active.
    let bare_compartment = SciCompartment::new(Box::from("ORCON"), Box::new([]));
    let sci_block = SciMarking::new(
        SciControlSystem::Published(SciControlBare::Si),
        Box::new([bare_compartment]),
        None,
    );

    let mut ism = IsmAttributes::default();
    ism.sci_markings = Box::new([sci_block]);
    let marking = CapcoMarking::new(ism);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        absorbed,
        "ORCON as SCI compartment identifier must be detected as absorption"
    );
}
5133
#[test]
fn absorbs_hard_splitter_negative_on_empty_marking() {
    use marque_ism::IsmAttributes;

    // Sanity floor: with neither SAR nor SCI present, the penalty
    // can never trigger.
    let marking = CapcoMarking::new(IsmAttributes::default());
    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&marking);
    assert!(
        !absorbed,
        "marking without SAR/SCI must not trigger the penalty"
    );
}
5146
#[test]
fn decoder_resolves_sar_with_trailing_noforn_via_absorption_penalty() {
    // `try_insert_delimiter` has always produced the right candidate
    // bytes for the SC-004 fixtures `SAR-BP-J12 …` and
    // `SPECIAL ACCESS REQUIRED-BUTTER POPCORN …` with a trailing
    // NOFORN. Before PR-5 those candidates lost the scoring contest
    // anyway: the absorbing strict parse contributed only the
    // classification's prior, while the delim-inserted parse also
    // paid the log-prior of NF. `HARD_SPLITTER_ABSORPTION_PENALTY`
    // flips the contest, and both fixture shapes are pinned here.
    let fixtures = [
        "TOP SECRET//SPECIAL ACCESS REQUIRED-BUTTER POPCORN NOFORN",
        "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB NOFORN",
    ];
    let rx = DecoderRecognizer::new();
    for input in fixtures.iter() {
        let m = match rx.recognize(input.as_bytes(), &deep_cx()) {
            Parsed::Unambiguous(m) => m,
            other => panic!("input {input:?}: expected Unambiguous, got {other:?}"),
        };

        assert!(
            m.0.sar_markings.is_some(),
            "input {input:?}: expected SAR present in winning candidate"
        );

        // PR #178 review (Copilot, decoder.rs:2841): check for the
        // SPECIFIC dissem control expected, `Nf`. The earlier
        // `!is_empty()` check would have stayed green if a future
        // regression emitted some other dissem token (e.g. a
        // misclassified `Oc`/`Pr`).
        let has_noforn = m
            .0
            .dissem_controls
            .iter()
            .any(|d| matches!(d, marque_ism::DissemControl::Nf));
        assert!(
            has_noforn,
            "input {input:?}: expected NOFORN (DissemControl::Nf) to land \
             as a dissem control (winning candidate must be the delim-\
             inserted form, not the absorbing one); got dissem_controls = \
             {:?}",
            m.0.dissem_controls,
        );

        assert!(
            !absorbs_hard_splitter_in_sar_or_sci(&m),
            "input {input:?}: winning marking must not bury a hard \
             splitter inside SAR/SCI"
        );
    }
}
5196
#[test]
fn decoder_rejects_trivial_strict_parse() {
    // The strict parser is lenient enough to accept
    // `FROBNITZ//WIBBLE`, producing an IsmAttributes with
    // classification=None, dissem_controls=[], sci_controls=[]. The
    // decoder has to read that as "no real parse" and drop the
    // candidate. Otherwise it would fabricate an empty marking for
    // arbitrary prose.
    let vocab = CapcoTokenSet;
    let strict = Parser::new(&vocab);
    let banner = MarkingCandidate {
        span: Span::new(0, 16),
        kind: MarkingType::Banner,
    };
    let attrs = strict
        .parse(&banner, b"FROBNITZ//WIBBLE")
        .expect("strict parser should accept arbitrary bytes")
        .attrs;
    let marking = CapcoMarking::new(attrs);

    let nontrivial = is_nontrivial_marking(&marking);
    assert!(!nontrivial, "empty marking must be filtered");
}
5219
#[test]
fn decoder_recovers_typo_sercet_to_secret() {
    // A deep-scan pass over `SERCET//NOFORN` must resolve
    // unambiguously, with the classification corrected to SECRET.
    let rx = DecoderRecognizer::new();
    let recovered = match rx.recognize(b"SERCET//NOFORN", &deep_cx()) {
        Parsed::Unambiguous(m) => m,
        other => panic!("expected Unambiguous(SECRET//NOFORN), got {other:?}"),
    };
    assert_eq!(
        marking_classification(&recovered),
        Some(Classification::Secret),
        "expected SECRET classification from SERCET fuzzy-correction"
    );
}
5234
#[test]
fn decoder_recovers_case_mangled_input() {
    // An all-lowercase banner must still resolve to a SECRET marking.
    let decoder = DecoderRecognizer::new();
    let marking = match decoder.recognize(b"secret//noforn", &deep_cx()) {
        Parsed::Unambiguous(m) => m,
        other => panic!("expected Unambiguous, got {other:?}"),
    };
    assert_eq!(marking_classification(&marking), Some(Classification::Secret));
}
5245
#[test]
fn decoder_suppresses_prose_glue_single_letter_portion() {
    // Prose-glue heuristic: a single-letter `(s)` / `(c)` whose preceding
    // byte is NOT whitespace is overwhelmingly a plural suffix
    // (`letter(s)`) or a function-call glyph (`function(c)`). The decoder
    // has to report zero candidates for these so the engine does not
    // synthesize a spurious R001 diagnostic.
    let decoder = DecoderRecognizer::new();
    let glued = ParseContext {
        preceded_by_whitespace: false,
        ..deep_cx()
    };
    let inputs: [&[u8]; 5] = [b"(s)", b"(c)", b"(u)", b"(S)", b"(C)"];
    for input in inputs.iter().copied() {
        // Printable form of the input, used only in failure messages.
        let shown = std::str::from_utf8(input).unwrap_or("<bytes>");
        if let Parsed::Ambiguous { candidates } = decoder.recognize(input, &glued) {
            assert!(
                candidates.is_empty(),
                "{:?} glued to a word must produce zero candidates, got {}",
                shown,
                candidates.len(),
            );
        } else {
            panic!("{:?} glued to a word must not resolve", shown);
        }
    }
}
5274
#[test]
fn decoder_canonicalizes_single_letter_when_preceded_by_whitespace() {
    // Mirror image of the prose-glue test. With
    // `preceded_by_whitespace = true` (the engine's start-of-buffer /
    // post-whitespace convention) a single-letter portion still goes
    // through the case-fold path and canonicalizes. Only the
    // glued-to-a-word shape is suppressed; mid-prose input with leading
    // whitespace stays the decoder's job and is to be governed separately
    // by future per-token null-hypothesis priors (issue #258).
    let decoder = DecoderRecognizer::new();
    let marking = match decoder.recognize(b"(s)", &deep_cx()) {
        Parsed::Unambiguous(m) => m,
        other => panic!("expected Unambiguous resolution, got {other:?}"),
    };
    assert_eq!(
        marking_classification(&marking),
        Some(Classification::Secret),
        "lowercase (s) with preceded_by_whitespace=true must \
         canonicalize to SECRET via the case-fold path"
    );
}
5298
#[test]
fn decoder_rejects_bare_restricted_via_recognizer_predicate() {
    // A bare restricted portion parses under the strict path's lenient
    // grammar but is rejected by `is_us_restricted`, both in the strict
    // recognizer and in the decoder's candidate loop (step 3c-bis). The
    // decoder must therefore yield zero candidates whether or not the
    // input was preceded by whitespace.
    let decoder = DecoderRecognizer::new();
    let contexts = [
        deep_cx(),
        ParseContext {
            preceded_by_whitespace: false,
            ..deep_cx()
        },
    ];
    for cx in contexts.iter() {
        let candidates = match decoder.recognize(b"(r)", cx) {
            Parsed::Ambiguous { candidates } => candidates,
            Parsed::Unambiguous(m) => panic!(
                "bare (r) must be rejected, got Unambiguous({:?})",
                m.0.classification
            ),
        };
        assert!(
            candidates.is_empty(),
            "bare (r) must be zero-candidate (preceded_by_whitespace={}), got {}",
            cx.preceded_by_whitespace,
            candidates.len()
        );
    }
}
5328
#[test]
fn decoder_recovers_superseded_comint_to_si() {
    // COMINT is superseded by SI (CAPCO-2016 §A.6 p16), so
    // `SECRET//COMINT//NOFORN` must come back as a SECRET marking whose SCI
    // controls include SI.
    let decoder = DecoderRecognizer::new();
    let marking = match decoder.recognize(b"SECRET//COMINT//NOFORN", &deep_cx()) {
        Parsed::Unambiguous(m) => m,
        other => panic!("expected Unambiguous, got {other:?}"),
    };
    assert_eq!(marking_classification(&marking), Some(Classification::Secret));

    // The corrected marking must list SI among its SCI controls.
    let mut sci = marking.0.sci_controls.iter();
    assert!(
        sci.any(|c| matches!(c, marque_ism::SciControl::Si)),
        "expected SI in sci_controls after COMINT supersession"
    );
}
5349
#[test]
fn decoder_recovers_reordered_banner() {
    // The input puts the dissem control first; canonical order is
    // classification first. A unique resolution must be SECRET, and an
    // ambiguous outcome is tolerated only if it surfaces candidates.
    let decoder = DecoderRecognizer::new();
    match decoder.recognize(b"NOFORN//SECRET", &deep_cx()) {
        Parsed::Ambiguous { candidates } => assert!(
            !candidates.is_empty(),
            "reordering should at least surface candidates"
        ),
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(level, Some(Classification::Secret));
        }
    }
}
5366
#[test]
fn decoder_honors_classification_floor_fr011() {
    // `(U)` canonicalizes to an UNCLASSIFIED portion. With the floor set
    // to SECRET that candidate has to be discarded, leaving nothing.
    let floor = Classification::Secret as u8;
    let cx = ParseContext {
        classification_floor: Some(floor),
        preceded_by_whitespace: true,
        strict_evidence: false,
        zone: None,
        position: None,
        as_of: None,
    };
    let decoder = DecoderRecognizer::new();
    let candidates = match decoder.recognize(b"(U)", &cx) {
        Parsed::Ambiguous { candidates } => candidates,
        Parsed::Unambiguous(m) => panic!(
            "expected zero-candidate, got Unambiguous({:?})",
            marking_classification(&m)
        ),
    };
    assert!(
        candidates.is_empty(),
        "UNCLASSIFIED below SECRET floor must produce zero candidates, got {}",
        candidates.len()
    );
}
5393
#[test]
fn decoder_classification_floor_allows_equal_or_above() {
    // `(S//NF)` against a CONFIDENTIAL floor: SECRET is above the floor,
    // so the candidate must be kept.
    let floor = Classification::Confidential as u8;
    let cx = ParseContext {
        classification_floor: Some(floor),
        preceded_by_whitespace: true,
        strict_evidence: false,
        zone: None,
        position: None,
        as_of: None,
    };
    let decoder = DecoderRecognizer::new();
    let marking = match decoder.recognize(b"(S//NF)", &cx) {
        Parsed::Unambiguous(m) => m,
        other => panic!("expected Unambiguous, got {other:?}"),
    };
    assert_eq!(marking_classification(&marking), Some(Classification::Secret));
}
5413
#[test]
fn normalize_delimiters_collapses_garbled_slash() {
    // Spaced-out U+2215 division slashes must collapse to a plain `//`.
    let normalized = normalize_delimiters_and_case("S ∕∕ NOFORN").0;
    assert_eq!(normalized, "S//NOFORN");
}
5419
#[test]
fn scan_token_captures_compound_with_hyphen() {
    // A hyphenated compound is scanned as one token; the scan stops at a
    // space or at the `//` separator.
    let before_space = scan_token("SI-G ABCD"); // covers "SI-G"
    let whole_input = scan_token("HCS-P");
    let before_separator = scan_token("SECRET//");
    assert_eq!(before_space, 4);
    assert_eq!(whole_input, 5);
    assert_eq!(before_separator, 6);
}
5426
#[test]
fn try_canonical_reorder_swaps_dissem_first_banner() {
    // A dissem-first banner is rewritten classification-first.
    let reordered = try_canonical_reorder("NOFORN//SECRET");
    assert_eq!(reordered.as_deref(), Some("SECRET//NOFORN"));
}
5434
#[test]
fn try_canonical_reorder_returns_none_when_already_canonical() {
    // Input that is already in canonical order yields no rewrite.
    let reordered = try_canonical_reorder("SECRET//NOFORN");
    assert_eq!(reordered, None);
}
5439
#[test]
fn classify_segment_treats_sci_as_other_not_dissem() {
    // Per CAPCO §A.6, HCS and SI are SCI controls, not dissem controls.
    // Regression guard from the PR #114 review: HCS used to sit in
    // `DISSEMS`, so `try_canonical_reorder` pushed an HCS segment past the
    // dissem block to the very end of the banner/portion and corrupted
    // canonicalization. SCI segments have to classify as
    // `SegmentClass::Other` so the reorder helper places them between
    // classification and dissem.
    let classes =
        ["HCS", "HCS-P", "SI", "SI-G", "TK"].map(|segment| classify_segment(segment));
    assert_eq!(
        classes,
        [
            SegmentClass::Other,
            SegmentClass::Other,
            SegmentClass::Other,
            SegmentClass::Other,
            SegmentClass::Other,
        ]
    );
}
5456
#[test]
fn classify_segment_non_ic_dissem_tokens() {
    // Regression guard for PR #256: the §H.9 abbreviations and their
    // long-title forms must classify as Dissem, so that
    // `try_canonical_reorder` puts them after SCI rather than in Other.
    let abbreviations = [
        "DS", "XD", "ND", "SBU", "SBU-NF", "LES", "LES-NF", "SSI", "LIMDIS", "EXDIS", "NODIS",
    ];
    for tok in abbreviations {
        assert_eq!(
            classify_segment(tok),
            SegmentClass::Dissem,
            "classify_segment({tok:?}) should be Dissem"
        );
    }

    // Multi-word long-title forms, checked as one batch.
    let long_titles = [
        "LIMITED DISTRIBUTION",
        "EXCLUSIVE DISTRIBUTION",
        "NO DISTRIBUTION",
        "LAW ENFORCEMENT SENSITIVE",
        "SENSITIVE BUT UNCLASSIFIED",
        "SENSITIVE SECURITY INFORMATION",
    ];
    assert_eq!(
        long_titles.map(|title| classify_segment(title)),
        [
            SegmentClass::Dissem,
            SegmentClass::Dissem,
            SegmentClass::Dissem,
            SegmentClass::Dissem,
            SegmentClass::Dissem,
            SegmentClass::Dissem,
        ]
    );
}
5494
#[test]
fn classify_segment_restricted_data_is_not_classification() {
    // Regression guard for PR #256. "RESTRICTED DATA" (AEA, §H.6) must not
    // be taken for the NATO RESTRICTED classification just because
    // "RESTRICTED" appears in CLASSIFICATIONS; the bare word must still
    // classify as a Classification.
    let aea = classify_segment("RESTRICTED DATA");
    let aea_with_suffix = classify_segment("RESTRICTED DATA-CNWDI");
    let nato = classify_segment("RESTRICTED");
    assert_eq!(aea, SegmentClass::Other);
    assert_eq!(aea_with_suffix, SegmentClass::Other);
    assert_eq!(nato, SegmentClass::Classification);
}
5508
#[test]
fn try_canonical_reorder_places_sci_between_classification_and_dissem() {
    // Dissem-first input with an SCI segment in the middle: the canonical
    // order is classification, then SCI, then dissem.
    let reordered = try_canonical_reorder("NOFORN//HCS-P//SECRET");
    assert_eq!(reordered.as_deref(), Some("SECRET//HCS-P//NOFORN"));
}
5518
#[test]
fn meets_classification_floor_rejects_below_floor() {
    // Obtain an UNCLASSIFIED marking through the decoder, then exercise
    // the floor predicate on it directly.
    let decoder = DecoderRecognizer::new();
    let u_marking = match decoder.recognize(b"(U)", &deep_cx()) {
        Parsed::Unambiguous(m) => m,
        Parsed::Ambiguous { .. } => {
            panic!("(U) should decode to unambiguous UNCLASSIFIED")
        }
    };
    let secret_floor = Classification::Secret as u8;
    let unclassified_floor = Classification::Unclassified as u8;
    // U sits below an S floor, so it is rejected.
    assert!(!meets_classification_floor(&u_marking, secret_floor));
    // U satisfies a U floor.
    assert!(meets_classification_floor(&u_marking, unclassified_floor));
}
5538
5539 // ----- SAR indicator-keyword structural repair (issue #133 PR 6) -----
5540
#[test]
fn sar_indicator_repair_strips_one_letter_prefix() {
    // The USAR-BP shape taken from the mangled corpus: the stray leading
    // `U` is removed and the rest of the banner is left untouched.
    let mangled = "SECRET//USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN";
    let repaired = try_sar_indicator_repair(mangled);
    assert_eq!(
        repaired.as_deref(),
        Some("SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN")
    );
}
5551
#[test]
fn sar_indicator_repair_strips_multi_letter_prefix() {
    // Prefixes of two and three letters still fall inside the structural
    // window. `XYZ` is not a CAPCO token or trigraph.
    let two_letter = try_sar_indicator_repair("SECRET//ABSAR-BP//NOFORN");
    let three_letter = try_sar_indicator_repair("SECRET//XYZSAR-BP//NOFORN");
    assert_eq!(two_letter.as_deref(), Some("SECRET//SAR-BP//NOFORN"));
    assert_eq!(three_letter.as_deref(), Some("SECRET//SAR-BP//NOFORN"));
}
5565
#[test]
fn sar_indicator_repair_strips_even_capco_token_prefix() {
    // By design the prefix-strip pass does NOT protect prefixes that, read
    // alone, spell a CAPCO token (`U`, `S`, `R`, `C`, `TS`, `SI`, `USA`, …).
    // Canonical CAPCO never glues a classification token, SCI control or
    // trigraph straight onto `SAR-` without a `//` separator, so whatever
    // precedes `SAR-` at a `//` / `(` / start boundary is OCR or
    // transcription drift, even when the bytes happen to form a known
    // token. A previous defensive guard that refused such strips broke the
    // central `USAR-` recovery case (`U` is the UNCLASSIFIED portion
    // form). This test pins the behavior so that any future "be more
    // conservative" change has to revisit that rationale first.
    let trigraph_prefix = try_sar_indicator_repair("SECRET//USASAR-BP//NOFORN");
    assert_eq!(
        trigraph_prefix.as_deref(),
        Some("SECRET//SAR-BP//NOFORN"),
        "must strip USA at boundary even though USA is a trigraph",
    );
    let paren_boundary = try_sar_indicator_repair("(USAR-BP)");
    assert_eq!(
        paren_boundary.as_deref(),
        Some("(SAR-BP)"),
        "boundary `(` must also trigger the strip pass",
    );
}
5591
#[test]
fn sar_indicator_repair_inserts_missing_hyphen_two_char_id() {
    // The SARBP missing-hyphen shape with a two-character identifier.
    let repaired = try_sar_indicator_repair("TOP SECRET//SARBP//NOFORN");
    assert_eq!(repaired.as_deref(), Some("TOP SECRET//SAR-BP//NOFORN"));
}
5600
#[test]
fn sar_indicator_repair_inserts_missing_hyphen_three_char_id() {
    // Three-character alphanumeric program identifier (§H.5 p100).
    let repaired = try_sar_indicator_repair("TOP SECRET//SARABC//NOFORN");
    assert_eq!(repaired.as_deref(), Some("TOP SECRET//SAR-ABC//NOFORN"));
}
5609
#[test]
fn sar_indicator_repair_inserts_missing_hyphen_before_compound() {
    // `SARBP-J12` becomes `SAR-BP-J12`: the two-character alphanumeric run
    // `BP` ends at the `-` delimiter, which is the missing-hyphen pattern.
    // Everything from `-J12` onward is carried over verbatim.
    let repaired = try_sar_indicator_repair("SECRET//SARBP-J12 J54//NOFORN");
    assert_eq!(repaired.as_deref(), Some("SECRET//SAR-BP-J12 J54//NOFORN"));
}
5620
#[test]
fn sar_indicator_repair_no_op_on_canonical() {
    // Already-canonical SAR shapes (and a banner without SAR) must come
    // back as `None`.
    let canonical_inputs = [
        "SECRET//SAR-BP//NOFORN",
        "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN",
        "TOP SECRET//SPECIAL ACCESS REQUIRED-BUTTER POPCORN//NOFORN",
        "SECRET//NOFORN",
    ];
    for input in canonical_inputs {
        let repaired = try_sar_indicator_repair(input);
        assert_eq!(
            repaired, None,
            "canonical input {input:?} must not be repaired"
        );
    }
}
5638
#[test]
fn sar_indicator_repair_skips_non_boundary_sar() {
    // `SAR` sitting mid-token, with no boundary character before the `S`,
    // is not the indicator keyword; it may simply be a SAR program
    // identifier that contains those letters. It must be left alone.
    let repaired = try_sar_indicator_repair("SECRET//FOO-USAR-BP");
    assert_eq!(
        repaired, None,
        "non-boundary SAR is not the indicator keyword"
    );
}
5650
#[test]
fn sar_indicator_repair_skips_long_alnum_run() {
    // Four or more alphanumeric characters after SAR fall outside the
    // §H.5 p100 2-3 character Abbrev-form identifier. The helper declines
    // to insert a hyphen, since `SAR-ABCD` would be a made-up, malformed
    // identifier.
    let repaired = try_sar_indicator_repair("SECRET//SARABCD//NOFORN");
    assert_eq!(
        repaired, None,
        "4-char alnum run violates §H.5 p100 2-3 char identifier"
    );
}
5663
#[test]
fn sar_indicator_repair_returns_none_when_no_sar_substring() {
    // Fast-path pre-check: with no `SAR` anywhere in the input there is
    // nothing to repair.
    let sci_only = try_sar_indicator_repair("TOP SECRET//SI-G ABCD//NOFORN");
    let empty = try_sar_indicator_repair("");
    let bare_classification = try_sar_indicator_repair("UNCLASSIFIED");
    assert_eq!(sci_only, None);
    assert_eq!(empty, None);
    assert_eq!(bare_classification, None);
}
5675
#[test]
fn match_sar_prefix_detects_one_to_three_letter_prefix() {
    // One-, two- and three-letter prefixes ahead of `SAR-` are all
    // detected when scanning from offset 0.
    let one_letter = match_sar_prefix(b"USAR-BP", 0);
    let two_letters = match_sar_prefix(b"ABSAR-BP", 0);
    let three_letters = match_sar_prefix(b"XYZSAR-BP", 0);
    assert_eq!(one_letter, Some((1, 5)));
    assert_eq!(two_letters, Some((2, 6)));
    assert_eq!(three_letters, Some((3, 7)));
}
5682
#[test]
fn match_sar_prefix_rejects_no_prefix_or_no_sar() {
    // No match without a prefix, and no match when the prefixed `SAR` is
    // not followed by `-`.
    let no_prefix = match_sar_prefix(b"SAR-BP", 0);
    let nothing_after_sar = match_sar_prefix(b"USAR", 0);
    let no_hyphen_after_sar = match_sar_prefix(b"USARBP", 0);
    assert_eq!(no_prefix, None);
    assert_eq!(nothing_after_sar, None);
    assert_eq!(no_hyphen_after_sar, None);
}
5689
#[test]
fn match_sar_missing_hyphen_detects_2_3_char_id() {
    // Two- and three-character identifiers directly after `SAR` match
    // when a delimiter follows.
    let slash_after = match_sar_missing_hyphen(b"SARBP/", 0);
    let space_after = match_sar_missing_hyphen(b"SARABC ", 0);
    // Reaching end-of-string is treated like a delimiter.
    let end_of_input = match_sar_missing_hyphen(b"SARBP", 0);
    assert_eq!(slash_after, Some(5));
    assert_eq!(space_after, Some(6));
    assert_eq!(end_of_input, Some(5));
}
5697
#[test]
fn match_sar_missing_hyphen_rejects_canonical_and_too_long() {
    // Canonical `SAR-`: the alphanumeric run after `SAR` has length 0.
    let canonical = match_sar_missing_hyphen(b"SAR-BP", 0);
    // A four-character run lies outside the §H.5 p100 2-3 window.
    let too_long = match_sar_missing_hyphen(b"SARABCD/", 0);
    // A one-character run lies outside the window as well.
    let too_short = match_sar_missing_hyphen(b"SARB/", 0);
    assert_eq!(canonical, None);
    assert_eq!(too_long, None);
    assert_eq!(too_short, None);
}
5707
#[test]
fn match_sar_missing_hyphen_rejects_non_delim_following_char() {
    // In every case the alphanumeric run fits the §H.5 p100 2-3 window,
    // but the byte right after it is neither alphanumeric nor one of the
    // delimiters (`-`, `/`, ` `, `\t`, `\n`, `\r`). Such a byte takes the
    // `next_is_delim = false` branch and the helper answers `None`,
    // declining to repair a grammatically suspicious shape (a SAR
    // identifier does not end at `,`, `)`, `;`, and so on). The helper is
    // tested directly because the `try_sar_indicator_repair` pinning only
    // reaches part of these cases through its upstream boundary check.
    let cases: [&[u8]; 7] = [
        b"SARBP)",  // closing paren, the byte that also ends a portion mark
        b"SARBP,",  // comma, a common typo separator
        b"SARBP;",  // semicolon
        b"SARBP*",  // asterisk
        b"SARBP=",  // equals
        b"SARABC.", // period after a 3-char id
        b"SARABC?", // question mark
    ];
    for input in cases.iter() {
        // Printable form of the input, used only in the failure message.
        let shown = std::str::from_utf8(input).unwrap_or("<non-utf8>");
        let matched = match_sar_missing_hyphen(input, 0);
        assert_eq!(
            matched, None,
            "input {:?} has non-delim follower; helper must refuse repair",
            shown,
        );
    }
}
5738
#[test]
fn sar_indicator_repair_skips_pattern_b_with_non_delim_follower() {
    // The same `next_is_delim = false` rejection, pinned end-to-end via
    // `try_sar_indicator_repair`. `SARBP)` starts at a `//` boundary, so
    // `at_boundary` holds and Pattern B is tried; the alphanumeric run has
    // length 2, yet `)` is not a delimiter, so the helper falls back to
    // copying the text verbatim. Without that rejection the output would
    // be `SAR-BP)`, a hyphen silently invented for a grammatically
    // suspicious input.
    let repaired = try_sar_indicator_repair("SECRET//SARBP)//NOFORN");
    assert_eq!(
        repaired, None,
        "Pattern B must refuse to fire when the post-alnum char isn't a delim",
    );
}
5755
5756 // ----- Stray-character `/X/` recovery (issue #133 PR 7) -----
5757
#[test]
fn try_collapse_stray_char_slash_emits_three_transforms() {
    // A `/X/` match yields exactly three candidates, in the order drop,
    // right-attach, left-attach. Pinning this contract turns any later
    // scope expansion (multi-pass, extra transforms) into a deliberate,
    // reviewable change.
    let result = try_collapse_stray_char_slash("AB/X/CD");
    assert_eq!(result.len(), 3, "expected 3 candidates; got {result:?}");
    let (dropped, right_attached, left_attached) = (&result[0], &result[1], &result[2]);
    assert_eq!(*dropped, "AB//CD"); // X removed
    assert_eq!(*right_attached, "AB//XCD"); // X joined onto CD
    assert_eq!(*left_attached, "ABX//CD"); // X joined onto AB
}
5770
#[test]
fn try_collapse_stray_char_slash_returns_empty_when_no_pattern() {
    // None of these inputs contains a `/X/` pattern, so each must produce
    // an empty candidate list.
    let non_matching = [
        "SECRET",
        "SECRET//NOFORN",
        "SECRET//NOFORN//EXDIS",
        "(C)",
        "",
        // Two or more alphanumerics between slashes is an ordinary token
        // (`/AB/`), not the stray-character shape.
        "SECRET/AB/CD",
        // Runs of `/` do not match: the single-char-between-slashes shape
        // needs an alphanumeric at bytes[i+1].
        "SECRET////NOFORN",
    ];
    for input in non_matching {
        let candidates = try_collapse_stray_char_slash(input);
        assert!(
            candidates.is_empty(),
            "input {input:?} should not match /X/ pattern",
        );
    }
}
5795
#[test]
fn try_collapse_stray_char_slash_requires_alnum_boundary() {
    // `/X/` only counts when an alphanumeric sits on both sides. Lacking
    // either one, the recovery has no meaning: there is no token to attach
    // X to, or no token beside the strip.
    // No leading boundary: `/X/Y` at position 0 has no alnum at i-1.
    let no_leading = try_collapse_stray_char_slash("/X/Y");
    assert!(no_leading.is_empty());
    // No trailing boundary: `Y/X/` has no alnum at i+3.
    let no_trailing = try_collapse_stray_char_slash("Y/X/");
    assert!(no_trailing.is_empty());
    // Alphanumerics on both sides: the pattern matches.
    let both_sides = try_collapse_stray_char_slash("Y/X/Z");
    assert_eq!(both_sides.len(), 3, "alnum on both sides should match");
}
5813
5814 // ----- REL TO structural repair (issue #133 PR 9) -----
5815
#[test]
fn rel_to_header_normalize_fixes_rel_ot_transposition() {
    // Pattern 1: the transposed header `REL OT ` is restored to `REL TO `.
    let expected = Some("SECRET//REL TO USA, AUS, GBR");
    assert_eq!(
        try_rel_to_header_normalize("SECRET//REL OT USA, AUS, GBR").as_deref(),
        expected,
        "REL OT must rewrite to REL TO at //-boundary",
    );
}
5826
#[test]
fn rel_to_header_normalize_fixes_relt_o_token_boundary() {
    // Pattern 2: in `RELT O ` the `T` has slid from the start of the next
    // token onto `REL`; the header is restored to `REL TO `. Left to the
    // fuzzy pass, `RELT` (4 chars) would be rewritten to `REL` (an
    // in-vocab DissemControl at distance 1) and USA would silently drop
    // out of the strict parse.
    let expected = Some("SECRET//REL TO USA, AUS, GBR");
    assert_eq!(
        try_rel_to_header_normalize("SECRET//RELT O USA, AUS, GBR").as_deref(),
        expected,
        "RELT O must rewrite to REL TO at //-boundary",
    );
}
5840
#[test]
fn rel_to_header_normalize_returns_none_on_canonical() {
    // A canonical `REL TO ` header, and text with no REL at all, pass
    // through without a rewrite.
    let canonical = try_rel_to_header_normalize("SECRET//REL TO USA, AUS, GBR");
    let without_rel = try_rel_to_header_normalize("SECRET//NOFORN");
    let empty = try_rel_to_header_normalize("");
    assert!(canonical.is_none());
    assert!(without_rel.is_none());
    assert!(empty.is_none());
}
5849
#[test]
fn rel_to_header_normalize_requires_token_boundary() {
    // The pattern must stay quiet when it is embedded in a longer
    // alphanumeric run. Absent the boundary check, `XREL OT Y` would match
    // on the substring `REL OT` even though the leading `X` makes `XREL`
    // a different token.
    let prefixed_rel_ot = try_rel_to_header_normalize("XREL OT Y");
    let prefixed_relt_o = try_rel_to_header_normalize("SOMETHINGRELT O Y");
    assert!(prefixed_rel_ot.is_none());
    assert!(prefixed_relt_o.is_none());
}
5859
#[test]
fn rel_to_entry_normalize_joins_a_us_to_aus() {
    // Pattern 3: the 4-char entry `A US` is joined into `AUS` only when
    // the joined three letters form a known trigraph. AUS is one; `A`
    // alone is not.
    // The entry's leading whitespace is preserved (there is none here),
    // hence the rewritten block reads `USA,AUS, GBR`.
    let expected = Some("SECRET//REL TO USA,AUS, GBR");
    assert_eq!(
        try_rel_to_entry_normalize("SECRET//REL TO USA,A US, GBR").as_deref(),
        expected,
        "A US should join to AUS when is_trigraph(AUS) holds",
    );
}
5874
#[test]
fn rel_to_entry_normalize_swaps_au_comma_s_to_aus_comma() {
    // Pattern 4: `<2-upper>,<1-upper><space>` is swapped to
    // `<3-upper joined>,`, but only when the joined trigraph is valid AND
    // the two-letter prefix is not itself a trigraph.
    let expected = Some("SECRET//REL TO USA, AUS, GBR");
    assert_eq!(
        try_rel_to_entry_normalize("SECRET//REL TO USA, AU,S GBR").as_deref(),
        expected,
        "AU,S should swap to AUS, when is_trigraph(AUS) holds and AU is not a trigraph",
    );
}
5887
#[test]
fn rel_to_entry_normalize_does_not_corrupt_eu_comma_pattern() {
    // EU is a valid two-character entry in its own right, so the entry
    // pass must leave a REL TO list containing it untouched. The rule
    // being protected is "only fix when the prefix alone is invalid":
    // Pattern 4 is meant to stay off for `EU,X ` because
    // `is_trigraph(EU)` holds.
    // NOTE(review): the input below is the canonical `EU, GBR` spelling
    // (comma followed by a space), not the `EU,X ` shape itself; confirm
    // whether a dedicated `EU,X ` case should be added.
    let rewritten = try_rel_to_entry_normalize("SECRET//REL TO USA, EU, GBR");
    assert!(
        rewritten.is_none(),
        "canonical EU entry must round-trip unchanged",
    );
}
5902
#[test]
fn rel_to_entry_normalize_returns_none_outside_rel_to() {
    // The entry patterns are scoped to REL TO blocks; with no REL TO
    // header present there is nothing for the entry pass to fix.
    let without_rel_to = try_rel_to_entry_normalize("SECRET//SI/TK//NOFORN");
    let empty = try_rel_to_entry_normalize("");
    assert!(without_rel_to.is_none());
    assert!(empty.is_none());
}
5910
#[test]
fn rel_to_structural_repair_short_circuits_without_rel() {
    // Pre-check: text containing no `REL` gets `None` right away, without
    // the byte walks.
    let banner = try_rel_to_structural_repair("SECRET//NOFORN");
    let portion = try_rel_to_structural_repair("(C)");
    let empty = try_rel_to_structural_repair("");
    assert!(banner.is_none());
    assert!(portion.is_none());
    assert!(empty.is_none());
}
5919
5920 // ----- SCI delimiter recovery (issue #198, #133 PR 10) -----
5921
#[test]
fn sci_delimiter_repair_concatenated_compound_hcsp() {
    // Pattern A: `HCSP` is the registered compound `HCS-P` with its hyphen
    // missing, so the hyphen is restored.
    let expected = Some("SECRET//HCS-P//NOFORN");
    assert_eq!(
        try_sci_delimiter_repair("SECRET//HCSP//NOFORN").as_deref(),
        expected,
        "HCSP must rewrite to HCS-P (CVE-registered compound)",
    );
}
5933
#[test]
fn sci_delimiter_repair_concatenated_compound_hcso() {
    // Pattern A: `HCSO` is rewritten to `HCS-O`.
    assert_eq!(
        try_sci_delimiter_repair("SECRET//HCSO//NOFORN").as_deref(),
        Some("SECRET//HCS-O//NOFORN")
    );
}
5940
#[test]
fn sci_delimiter_repair_concatenated_compound_sig() {
    // Pattern A: `SIG` is rewritten to `SI-G`. SI-G is on the CVE list; G
    // is a compartment of SI per §A.6 p16.
    assert_eq!(
        try_sci_delimiter_repair("SECRET//SIG//NOFORN").as_deref(),
        Some("SECRET//SI-G//NOFORN")
    );
}
5948
#[test]
fn sci_delimiter_repair_concatenated_compound_tkkand() {
    // Pattern A: `TKKAND` is rewritten to `TK-KAND`, which checks that the
    // longer (6-char) concatenated forms are matched too.
    assert_eq!(
        try_sci_delimiter_repair("SECRET//TKKAND//NOFORN").as_deref(),
        Some("SECRET//TK-KAND//NOFORN")
    );
}
5956
#[test]
fn sci_delimiter_repair_schema_coverage_bur_compounds() {
    // Pattern A is driven by the schema through `SciControl::parse`, so it
    // reaches every CVE compound automatically, including the BUR-* family
    // that an earlier hand-maintained list left out. This locks in the
    // schema-derived contract: compounds added by a future ODNI schema
    // bump are covered with no change to this file.
    let pairs = [
        ("SECRET//BURBLG//NOFORN", "SECRET//BUR-BLG//NOFORN"),
        ("SECRET//BURDTP//NOFORN", "SECRET//BUR-DTP//NOFORN"),
        ("SECRET//BURWRG//NOFORN", "SECRET//BUR-WRG//NOFORN"),
    ];
    for (mangled, canonical) in pairs {
        assert_eq!(
            try_sci_delimiter_repair(mangled).as_deref(),
            Some(canonical),
        );
    }
}
5978
#[test]
fn sci_delimiter_repair_missing_slash_sitk() {
    // Pattern B: `SITK` is rewritten to `SI/TK`. Per §A.6 p16 and the p194
    // example, several control systems inside one SCI category are
    // separated by `/`.
    let expected = Some("SECRET//SI/TK//NOFORN");
    assert_eq!(
        try_sci_delimiter_repair("SECRET//SITK//NOFORN").as_deref(),
        expected,
        "SITK must rewrite to SI/TK (two bare control systems concatenated)",
    );
}
5990
#[test]
fn sci_delimiter_repair_missing_slash_hcssi() {
    // Pattern B: `HCSSI` is rewritten to `HCS/SI`, exercising the 3+2
    // split (HCS has length 3, SI has length 2).
    assert_eq!(
        try_sci_delimiter_repair("SECRET//HCSSI//NOFORN").as_deref(),
        Some("SECRET//HCS/SI//NOFORN")
    );
}
5998
#[test]
fn sci_delimiter_repair_wrong_delimiter_si_dash_tk() {
    // Pattern C: `SI-TK` is rewritten to `SI/TK`. The whole token is not a
    // CVE compound and both halves are bare control systems, so `-` is the
    // wrong separator.
    let expected = Some("SECRET//SI/TK//NOFORN");
    assert_eq!(
        try_sci_delimiter_repair("SECRET//SI-TK//NOFORN").as_deref(),
        expected,
        "SI-TK must rewrite to SI/TK (two bare CS, `-` is for control-compartment)",
    );
}
6010
#[test]
fn sci_delimiter_repair_leaves_registered_compound_alone() {
    // Pattern C must NOT touch registered compounds. SI-G, for one, is in
    // CVEnumISMSCIControls.xml, so its `-` is the correct separator.
    let si_g = try_sci_delimiter_repair("SECRET//SI-G//NOFORN");
    let hcs_p = try_sci_delimiter_repair("SECRET//HCS-P//NOFORN");
    let tk_kand = try_sci_delimiter_repair("SECRET//TK-KAND//NOFORN");
    assert!(si_g.is_none());
    assert!(hcs_p.is_none());
    assert!(tk_kand.is_none());
}
6019
#[test]
fn sci_delimiter_repair_returns_none_on_canonical() {
    // Inputs that are already canonical pass through without a rewrite.
    let two_systems = try_sci_delimiter_repair("SECRET//SI/TK//NOFORN");
    let one_system = try_sci_delimiter_repair("SECRET//SI//NOFORN");
    let no_sci = try_sci_delimiter_repair("SECRET//NOFORN");
    let empty = try_sci_delimiter_repair("");
    assert!(two_systems.is_none());
    assert!(one_system.is_none());
    assert!(no_sci.is_none());
    assert!(empty.is_none());
}
6028
#[test]
fn sci_delimiter_repair_does_not_fire_on_word_substring() {
    // Pattern A demands whole-token equality, not mere containment.
    // SIGMA has "SIG" as a substring yet is a single token.
    let sigma = try_sci_delimiter_repair("SIGMA");
    assert!(sigma.is_none());
    // SITE enjoys the same protection.
    let site = try_sci_delimiter_repair("SITE");
    assert!(site.is_none());
    // SIGNAL also contains SIG; the whole token is not in Pattern A.
    let signal = try_sci_delimiter_repair("SIGNAL");
    assert!(signal.is_none());
}
6039
#[test]
fn sci_delimiter_repair_short_circuits_without_sci_root() {
    // Pre-check: with no SCI control system substring present, no repair
    // is attempted.
    let banner = try_sci_delimiter_repair("CONFIDENTIAL//NOFORN");
    let portion = try_sci_delimiter_repair("(C)");
    let empty = try_sci_delimiter_repair("");
    assert!(banner.is_none());
    assert!(portion.is_none());
    assert!(empty.is_none());
}
6047
#[test]
fn sci_delimiter_repair_does_not_panic_on_non_ascii() {
    // Multi-byte UTF-8 input must never cause a panic. The SCI vocabulary
    // is pure ASCII, so non-ASCII input can never match; bailing out early
    // avoids a byte-offset slice landing mid-codepoint. The inputs are
    // picked to hit both the outer scanner (`try_sci_delimiter_repair`)
    // and the inner per-token classifier (`repair_sci_token`).
    let trailing_cjk = try_sci_delimiter_repair("SECRET//SI/TK//日本語");
    assert!(trailing_cjk.is_none());
    let leading_greek = try_sci_delimiter_repair("Ω SI TK");
    assert!(leading_greek.is_none());
    let all_kana = try_sci_delimiter_repair("こんにちは");
    assert!(all_kana.is_none());
    // The per-token helper is also called directly with non-ASCII content.
    let mixed_token = repair_sci_token("SI日");
    assert!(mixed_token.is_none());
    let cjk_token = repair_sci_token("日本");
    assert!(cjk_token.is_none());
}
6063
#[test]
fn repair_sci_token_rejects_partial_decompositions() {
    // HCSI splits as HCS+I (I is not bare) or H+CSI (neither is bare), so
    // there is no valid Pattern B decomposition.
    let hcsi = repair_sci_token("HCSI");
    assert!(hcsi.is_none());
    // ABCDE is arbitrary text with no valid control-system decomposition.
    let abcde = repair_sci_token("ABCDE");
    assert!(abcde.is_none());
    // BUR by itself is a bare control system of length 3: it fails
    // Pattern B's 4..=6 length check, has no `-`, and is not in Pattern A.
    let bur = repair_sci_token("BUR");
    assert!(bur.is_none());
}
6075
#[test]
fn try_collapse_stray_char_slash_processes_only_first_match() {
    // PR 7 scope: only the first `/X/` is handled. Inputs with several
    // patterns would need a future multi-pass extension.
    let result = try_collapse_stray_char_slash("A/X/B/Y/C");
    assert_eq!(result.len(), 3);
    // Each candidate applies the first transform only; the later `/Y/`
    // pattern stays in place verbatim.
    let expected = [
        "A//B/Y/C",  // first X dropped
        "A//XB/Y/C", // first X attached to the right
        "AX//B/Y/C", // first X attached to the left
    ];
    for (slot, want) in expected.iter().enumerate() {
        assert_eq!(result[slot], *want);
    }
}
6088
#[test]
fn decoder_recovers_drop_stray_char() {
    // End-to-end check of the drop-X transform:
    // `SECRET//NOFORN/R/EXDIS` must come back as the canonical
    // `SECRET//NOFORN//EXDIS`. The competing candidates are
    // right-attach (`SECRET//NOFORN//REXDIS`, REXDIS unknown) and
    // left-attach (`SECRET//NOFORNR//EXDIS`, NOFORNR unknown); both
    // are discarded by step 3a's Unknown-token filter.
    // Pinned per `tests/fixtures/mangled/typo/7885156a2c2c125f.json`.
    let recognizer = DecoderRecognizer::new();
    let cx = deep_cx();
    let parsed = recognizer.recognize(b"SECRET//NOFORN/R/EXDIS", &cx);
    let Parsed::Unambiguous(recovered) = parsed else {
        panic!("`/R/` between NOFORN and EXDIS must resolve via drop-X");
    };

    // Classification level must be SECRET.
    let level = recovered
        .0
        .classification
        .as_ref()
        .map(|class| class.effective_level());
    assert_eq!(level, Some(Classification::Secret));

    // NOFORN sits to the left of the stray `/R/` and must be intact.
    let has_noforn = recovered
        .0
        .dissem_controls
        .iter()
        .any(|control| matches!(control, marque_ism::DissemControl::Nf));
    assert!(
        has_noforn,
        "NOFORN must survive; attrs = {:?}",
        recovered.0,
    );

    // EXDIS sits to the right of the stray `/R/` and must be intact.
    let has_exdis = recovered
        .0
        .non_ic_dissem
        .iter()
        .any(|control| matches!(control, marque_ism::NonIcDissem::Exdis));
    assert!(
        has_exdis,
        "EXDIS must survive; attrs = {:?}",
        recovered.0,
    );
}
6129
#[test]
fn decoder_recovers_right_attach_stray_char() {
    // End-to-end check of the right-attach transform:
    // `TOP SECRET//SI/N/OFORN` must come back as the canonical
    // `TOP SECRET//SI//NOFORN`, because the stray `N` was the
    // leading character of NOFORN. The competing candidates are
    // drop (`TOP SECRET//SI//OFORN`, OFORN unknown) and left-attach
    // (`TOP SECRET//SIN//OFORN`, both unknown); step 3a's
    // Unknown-token filter discards them.
    // Pinned per `tests/fixtures/mangled/typo/2cb13fe4682ff31c.json`.
    let recognizer = DecoderRecognizer::new();
    let cx = deep_cx();
    let parsed = recognizer.recognize(b"TOP SECRET//SI/N/OFORN", &cx);
    let Parsed::Unambiguous(recovered) = parsed else {
        panic!("`/N/` before OFORN must resolve via right-attach");
    };

    // Classification level must be TOP SECRET.
    let level = recovered
        .0
        .classification
        .as_ref()
        .map(|class| class.effective_level());
    assert_eq!(level, Some(Classification::TopSecret));

    // SI must not be disturbed by the repair next to it.
    let has_si = recovered
        .0
        .sci_controls
        .iter()
        .any(|control| matches!(control, marque_ism::SciControl::Si));
    assert!(has_si, "SI must survive; attrs = {:?}", recovered.0);

    // NOFORN must be rebuilt from the stray `N` plus `OFORN`.
    let has_noforn = recovered
        .0
        .dissem_controls
        .iter()
        .any(|control| matches!(control, marque_ism::DissemControl::Nf));
    assert!(
        has_noforn,
        "NOFORN must be reconstructed; attrs = {:?}",
        recovered.0,
    );
}
6171
#[test]
fn decoder_recovers_left_attach_stray_char() {
    // End-to-end check of the left-attach transform:
    // `SECRE/T/REL TO USA, AUS, GBR` must come back as the canonical
    // `SECRET//REL TO USA, AUS, GBR`, because the stray `T` was the
    // trailing character of SECRET. The competing candidates are
    // drop (`SECRE//REL TO ...`, SECRE unknown) and right-attach
    // (`SECRE//TREL TO ...`, both unknown); step 3a discards them.
    // Pinned per `tests/fixtures/mangled/typo/cff1d0ac74e901c3.json`.
    let recognizer = DecoderRecognizer::new();
    let cx = deep_cx();
    let parsed = recognizer.recognize(b"SECRE/T/REL TO USA, AUS, GBR", &cx);
    let Parsed::Unambiguous(recovered) = parsed else {
        panic!("`/T/` after SECRE must resolve via left-attach");
    };

    // Classification level must be SECRET once the `T` is re-attached.
    let level = recovered
        .0
        .classification
        .as_ref()
        .map(|class| class.effective_level());
    assert_eq!(level, Some(Classification::Secret));

    // The REL TO list to the right of the repair must be complete.
    let rel_to_count = recovered.0.rel_to.len();
    assert_eq!(
        rel_to_count,
        3,
        "REL TO must carry 3 trigraphs (USA, AUS, GBR); attrs = {:?}",
        recovered.0,
    );
}
6202
#[test]
fn decoder_recovers_usar_prefix_via_sar_indicator_repair() {
    // End-to-end recognizer check: the canonical USAR-BP fixture
    // shape from the mangled corpus has to resolve unambiguously to
    // a SECRET marking that carries a SAR block.
    // Pinned per `tests/fixtures/mangled/typo/d04f45f7a4f5a8b4.json`.
    let recognizer = DecoderRecognizer::new();
    let cx = deep_cx();
    let parsed = recognizer.recognize(
        b"SECRET//USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN",
        &cx,
    );
    let Parsed::Unambiguous(recovered) = parsed else {
        panic!("USAR-BP-... must resolve via SAR indicator repair");
    };

    // Classification level must be SECRET.
    let level = recovered
        .0
        .classification
        .as_ref()
        .map(|class| class.effective_level());
    assert_eq!(level, Some(Classification::Secret));

    // The repaired indicator must yield a SAR block.
    let has_sar_block = recovered.0.sar_markings.is_some();
    assert!(
        has_sar_block,
        "SAR block must be present after USAR→SAR repair; attrs = {:?}",
        recovered.0,
    );

    // The trailing NOFORN must not be affected by the SAR repair.
    let has_noforn = recovered
        .0
        .dissem_controls
        .iter()
        .any(|control| matches!(control, marque_ism::DissemControl::Nf));
    assert!(
        has_noforn,
        "NOFORN must survive; attrs = {:?}",
        recovered.0,
    );
}
6239
#[test]
fn decoder_recovers_sarbp_missing_hyphen_via_sar_indicator_repair() {
    // End-to-end check: `SARBP` (hyphen missing) must be repaired to
    // the canonical `SAR-BP` per §H.5 p100.
    // Pinned per `tests/fixtures/mangled/typo/fbf5ed813c109c14.json`.
    let recognizer = DecoderRecognizer::new();
    let cx = deep_cx();
    let parsed = recognizer.recognize(b"TOP SECRET//SARBP//NOFORN", &cx);
    let Parsed::Unambiguous(recovered) = parsed else {
        panic!("SARBP must resolve via SAR indicator repair");
    };

    // Classification level must be TOP SECRET.
    let level = recovered
        .0
        .classification
        .as_ref()
        .map(|class| class.effective_level());
    assert_eq!(level, Some(Classification::TopSecret));

    // The repaired indicator must yield a SAR block ...
    let Some(sar) = recovered.0.sar_markings.as_ref() else {
        panic!("SAR block must be present");
    };
    // ... holding exactly one program, identified as `BP`.
    let program_count = sar.programs.len();
    assert_eq!(program_count, 1, "exactly one program; got {sar:?}");
    let program_id = &*sar.programs[0].identifier;
    assert_eq!(
        program_id, "BP",
        "program identifier must be `BP` after hyphen insertion; got {sar:?}",
    );
}
6269
#[test]
fn decoder_recovers_spcial_via_extended_correction_vocab() {
    // `SPCIAL` (typo in `SPECIAL`) — issue #133 PR 6 vocab
    // addition. The fuzzy matcher now finds `SPECIAL` at edit
    // distance 1, the strict SAR parser then matches the
    // `SPECIAL ACCESS REQUIRED-BUTTER POPCORN` indicator
    // literally. Pinned per
    // `tests/fixtures/mangled/typo/1f75ddd89b432949.json`.
    let rx = DecoderRecognizer::new();
    let Parsed::Unambiguous(marking) = rx.recognize(
        b"TOP SECRET//SPCIAL ACCESS REQUIRED-BUTTER POPCORN//NOFORN",
        &deep_cx(),
    ) else {
        panic!("SPCIAL must fuzzy-correct to SPECIAL");
    };
    assert_eq!(
        marking
            .0
            .classification
            .as_ref()
            .map(|c| c.effective_level()),
        Some(Classification::TopSecret),
    );
    let sar = marking
        .0
        .sar_markings
        .as_ref()
        .expect("SAR block must be present");
    // Checked lookup rather than `programs[0]`: if the SAR block
    // comes back with no programs, fail with a message that dumps
    // the block instead of a bare index-out-of-bounds panic.
    let Some(program) = sar.programs.first() else {
        panic!("SAR block must carry at least one program; got {sar:?}");
    };
    assert_eq!(
        &*program.identifier, "BUTTER POPCORN",
        "Full-form program identifier must round-trip; got {sar:?}",
    );
}
6303}